From 4f8a6dd61c58a9075f7feb487e0976f429668b55 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Thu, 29 Aug 2019 11:09:20 -0700 Subject: [PATCH] Implemented a "Greedy by size" planner that produces a more optimal memory allocation. - Mobilenet V1 shows a 35% memory reduction - Mobilenet V2 shows a 2% reduction The majority of the other tested models demonstrated a ~10-15% improvement. PiperOrigin-RevId: 266182266 --- tensorflow/lite/BUILD | 1 - tensorflow/lite/allocation.h | 2 +- tensorflow/lite/arena_planner.cc | 211 ++++++++-------- tensorflow/lite/arena_planner.h | 52 ++-- tensorflow/lite/arena_planner_test.cc | 254 ++++++++++++-------- tensorflow/lite/core/subgraph.cc | 25 +- tensorflow/lite/core/subgraph.h | 8 - tensorflow/lite/interpreter_test.cc | 15 +- tensorflow/lite/memory_planner.h | 5 - tensorflow/lite/simple_memory_arena.cc | 78 +++--- tensorflow/lite/simple_memory_arena.h | 33 +-- tensorflow/lite/simple_memory_arena_test.cc | 58 ++--- 12 files changed, 367 insertions(+), 375 deletions(-) diff --git a/tensorflow/lite/BUILD b/tensorflow/lite/BUILD index e353edd121e..c62d7ec9219 100644 --- a/tensorflow/lite/BUILD +++ b/tensorflow/lite/BUILD @@ -174,7 +174,6 @@ cc_library( ], copts = TFLITE_DEFAULT_COPTS, deps = [ - ":simple_memory_arena", ":string", "//tensorflow/lite/c:c_api_internal", "//tensorflow/lite/core/api", diff --git a/tensorflow/lite/allocation.h b/tensorflow/lite/allocation.h index baf9ac3d421..d76207bbdac 100644 --- a/tensorflow/lite/allocation.h +++ b/tensorflow/lite/allocation.h @@ -19,11 +19,11 @@ limitations under the License. #include #include +#include #include #include "tensorflow/lite/c/c_api_internal.h" #include "tensorflow/lite/core/api/error_reporter.h" -#include "tensorflow/lite/simple_memory_arena.h" #include "tensorflow/lite/string.h" namespace tflite { diff --git a/tensorflow/lite/arena_planner.cc b/tensorflow/lite/arena_planner.cc index 3258f612c18..e1a042b2188 100644 --- a/tensorflow/lite/arena_planner.cc +++ b/tensorflow/lite/arena_planner.cc @@ -13,18 +13,18 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include "tensorflow/lite/arena_planner.h" + +#include +#include +#include #include namespace tflite { +namespace { -struct AllocationInfo { - // The node index requesting this allocation. - int node; - // The tensor index to be allocated or deallocated. - int tensor; - // Whether to allocate or deallocate - enum Type { ALLOC, DEALLOC } type; -}; +constexpr size_t kNotAssigned = std::numeric_limits<size_t>::max(); + +} // namespace ArenaPlanner::ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info, @@ -55,52 +55,42 @@ TfLiteStatus ArenaPlanner::ResetAllocations() { TF_LITE_ENSURE_STATUS(persistent_arena_.Clear()); allocs_.clear(); allocs_.resize(graph_info_->num_tensors()); - // Note that we only clear the alloc_queue_ when re-planning allocations, as - // it should only change when the graph topology itself changes. + order_.clear(); + was_added_.clear(); return kTfLiteOk; } TfLiteStatus ArenaPlanner::PlanAllocations() { // Invalidate any existing data. TF_LITE_ENSURE_STATUS(ResetAllocations()); - // The alloc_queue_ is specific to the graph topology, and will be - // completely reconstructed from graph data here.
- alloc_queue_.clear(); + // Mark every tensor as not yet having an allocation or deallocation node. + alloc_node_.assign(graph_info_->num_tensors(), kNotAssigned); + dealloc_node_.assign(graph_info_->num_tensors(), kNotAssigned); // Keeps track of references to each tensor. std::vector<int> refcounts(graph_info_->num_tensors(), 0); - // `allocated` and `deallocated` are technically list of boolean values. - // We're saving the compiled binary size by using `vector`. - std::vector allocated(graph_info_->num_tensors(), false); - std::vector deallocated(graph_info_->num_tensors(), false); - auto allocate = [this, &allocated, &deallocated](int node, - int tensor) -> TfLiteStatus { - if (allocated[tensor]) { + auto allocate = [this](int node, int tensor) -> TfLiteStatus { + if (alloc_node_[tensor] != kNotAssigned) { + // Tensor has already been allocated. return kTfLiteOk; } - TF_LITE_ENSURE(context_, !deallocated[tensor]); - alloc_queue_.push_back({node, tensor, AllocationInfo::ALLOC}); - allocated[tensor] = true; + TF_LITE_ENSURE(context_, dealloc_node_[tensor] == kNotAssigned); + alloc_node_[tensor] = node; return kTfLiteOk; }; - auto deallocate = [this, &allocated, &deallocated]( - int node, int tensor) -> TfLiteStatus { - if (!allocated[tensor]) { - // Do not enqueue a DEALLOC if the tensor is never allocated. + auto deallocate = [this](int node, int tensor) -> TfLiteStatus { + if (alloc_node_[tensor] == kNotAssigned) { + // There is no need to deallocate a tensor that was never allocated. // This happens with constant tensors. return kTfLiteOk; } - TF_LITE_ENSURE(context_, !deallocated[tensor]); - alloc_queue_.push_back({node, tensor, AllocationInfo::DEALLOC}); + TF_LITE_ENSURE(context_, dealloc_node_[tensor] == kNotAssigned); + dealloc_node_[tensor] = node; return kTfLiteOk; }; - // There will be an entry in alloc_queue_ for the allocation of each tensor - // and another for their deallocation. - alloc_queue_.reserve(2 * graph_info_->num_tensors()); - // We must make sure the output tensors are never overwritten. We do that by // artificially adding one to their ref-counts so they are never selected // for deallocation. @@ -188,12 +178,27 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { // Grow the size of `allocs_` if necessary. This allows allocating temporary // tensors in an op's `prepare` function. TF_LITE_ENSURE(context_, graph_info_->num_tensors() >= allocs_.size()); + alloc_node_.resize(graph_info_->num_tensors(), kNotAssigned); + dealloc_node_.resize(graph_info_->num_tensors(), kNotAssigned); allocs_.resize(graph_info_->num_tensors()); + was_added_.assign(graph_info_->num_tensors(), false); + order_.clear(); + // Set the allocation and deallocation node for each temporary tensor. + for (size_t i = first_node; i <= last_node && i < graph_info_->num_nodes(); + ++i) { + const TfLiteNode& node = graph_info_->node(i); + TfLiteIntArray* node_temporaries = node.temporaries; + for (int j = 0; j < node_temporaries->size; ++j) { + int tensor_index = node_temporaries->data[j]; + alloc_node_[tensor_index] = i; + dealloc_node_[tensor_index] = i; + } + } TF_LITE_ENSURE_STATUS(CalculateAllocations(first_node, last_node)); TF_LITE_ENSURE_STATUS(Commit()); - for (int i = 0; i < static_cast<int>(graph_info_->num_tensors()); ++i) { + for (size_t i = 0; i < graph_info_->num_tensors(); ++i) { // TODO(ahentz): we could do this only for the tensors that were modified // in CalculateAllocations(), instead of redoing it for tensors that // already had proper pointers.
However we must be very careful, because @@ -204,48 +209,71 @@ TfLiteStatus ArenaPlanner::ExecuteAllocations(int first_node, int last_node) { return kTfLiteOk; } -TfLiteStatus ArenaPlanner::Commit() { - TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); - TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); +TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { + for (size_t i = 0; i < graph_info_->num_tensors(); ++i) { + if (alloc_node_[i] >= first_node && alloc_node_[i] <= last_node) { + AddTensorIfNeeded(i); + } + } + + std::sort(order_.begin(), order_.end(), CompareBySize(this)); + + // Allocate the tensors in the order computed above; the arena keeps the + // already allocated tensors ordered by offset. + for (const auto& tensor_index : order_) { + TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); + if (tensor.allocation_type == kTfLiteArenaRw) { + TF_LITE_ENSURE_STATUS(arena_.Allocate( + context_, tensor_alignment_, tensor.bytes, alloc_node_[tensor_index], + dealloc_node_[tensor_index], &allocs_[tensor_index])); + } + if (tensor.allocation_type == kTfLiteArenaRwPersistent) { + TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate( + context_, tensor_alignment_, tensor.bytes, alloc_node_[tensor_index], + std::numeric_limits<size_t>::max(), &allocs_[tensor_index])); + } + } return kTfLiteOk; } -TfLiteStatus ArenaPlanner::CalculateAllocations(int first_node, int last_node) { - int active_node = first_node; - // When dynamic tensors are present this method is called multiple times. - // The items in the alloc_queue_ referring to nodes before first_node were - // processed previously and should be skipped. Entries after last_node are - // not yet ready to be handled. - for (const auto& alloc_info : alloc_queue_) { - if (alloc_info.node < first_node) continue; - if (alloc_info.node > last_node) break; - if (alloc_info.node == active_node) { - // This is the first allocation/deallocation for a given node. It is - // time to deallocate the previous temporaries and allocate new ones. - if (active_node != first_node) { - TF_LITE_ENSURE_STATUS( - CalculateDeallocationOfInternalTensors(active_node - 1)); - } - TF_LITE_ENSURE_STATUS(CalculateAllocationOfInternalTensors(active_node)); - ++active_node; - } - // Handle the current item. - if (alloc_info.type == AllocationInfo::ALLOC) { - TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(alloc_info.tensor)); - } else { - TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(alloc_info.tensor)); +void ArenaPlanner::AddTensorIfNeeded(int tensor_index) { + if (!was_added_[tensor_index]) { + was_added_[tensor_index] = true; + order_.push_back(tensor_index); + } +} + +bool ArenaPlanner::CompareBySize::operator()(const int idx1, + const int idx2) const { + // Tensors that stay alive for the whole inference are allocated at the + // beginning of the memory slice. Their relative order does not actually + // matter, so they are simply sorted by index. + if (planner->alloc_node_[idx1] == 0 && + planner->dealloc_node_[idx1] == kNotAssigned) { + if (planner->alloc_node_[idx2] == 0 && + planner->dealloc_node_[idx2] == kNotAssigned) { + return idx1 < idx2; } + return true; + } + if (planner->alloc_node_[idx2] == 0 && + planner->dealloc_node_[idx2] == kNotAssigned) { + return false; } - // For the case if the graph is empty the node index can be negative since we - // substract from the active node, so the node_index can be zero for those - // cases - if (active_node > 0) { - // Don't forget to deallocate temporaries of last node.
- TF_LITE_ENSURE_STATUS( - CalculateDeallocationOfInternalTensors(active_node - 1)); + // All other tensors are sorted in non-increasing order of their size. + auto size1 = planner->graph_info_->tensor(idx1)->bytes; + auto size2 = planner->graph_info_->tensor(idx2)->bytes; + if (size1 != size2) { + return size1 > size2; } + // Tensors with equal size are sorted in order of their allocation time. + return planner->alloc_node_[idx1] < planner->alloc_node_[idx2]; +} + +TfLiteStatus ArenaPlanner::Commit() { + TF_LITE_ENSURE_STATUS(arena_.Commit(context_)); + TF_LITE_ENSURE_STATUS(persistent_arena_.Commit(context_)); return kTfLiteOk; } @@ -266,51 +294,4 @@ TfLiteStatus ArenaPlanner::ResolveTensorAllocation(int tensor_index) { return kTfLiteOk; } -TfLiteStatus ArenaPlanner::CalculateTensorAllocation(int tensor_index) { - TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_STATUS(arena_.Allocate( - context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index])); - } - if (tensor.allocation_type == kTfLiteArenaRwPersistent) { - TF_LITE_ENSURE_STATUS(persistent_arena_.Allocate( - context_, tensor_alignment_, tensor.bytes, &allocs_[tensor_index])); - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateTensorDeallocation(int tensor_index) { - TfLiteTensor& tensor = *graph_info_->tensor(tensor_index); - if (tensor.allocation_type == kTfLiteArenaRw) { - TF_LITE_ENSURE_STATUS(arena_.Deallocate(context_, allocs_[tensor_index])); - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateAllocationOfInternalTensors( - int node_index) { - if (node_index < static_cast(graph_info_->num_nodes())) { - const TfLiteNode& node = graph_info_->node(static_cast(node_index)); - TfLiteIntArray* node_temporaries = node.temporaries; - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TF_LITE_ENSURE_STATUS(CalculateTensorAllocation(tensor_index)); - } - } - return kTfLiteOk; -} - -TfLiteStatus ArenaPlanner::CalculateDeallocationOfInternalTensors( - int node_index) { - if (node_index < static_cast(graph_info_->num_nodes())) { - const TfLiteNode& node = graph_info_->node(static_cast(node_index)); - TfLiteIntArray* node_temporaries = node.temporaries; - for (int i = 0; i < node_temporaries->size; ++i) { - int tensor_index = node_temporaries->data[i]; - TF_LITE_ENSURE_STATUS(CalculateTensorDeallocation(tensor_index)); - } - } - return kTfLiteOk; -} - } // namespace tflite diff --git a/tensorflow/lite/arena_planner.h b/tensorflow/lite/arena_planner.h index 569e5d98db2..31f9fc6d8e3 100644 --- a/tensorflow/lite/arena_planner.h +++ b/tensorflow/lite/arena_planner.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_LITE_ARENA_PLANNER_H_ #define TENSORFLOW_LITE_ARENA_PLANNER_H_ +#include #include #include @@ -44,17 +45,12 @@ struct AllocationInfo; // execution. Since dynamic tensors don't have sizes until after the // corresponding operation is executed, this class supports incremental // planning. -// -// TODO(b/127354079): Remove the constrain below when the issue is fixed. -// WARNING: MemoryPlanner's behavior must be deterministic. If the first N -// nodes are unchanged, it must produce exactly the same allocation plan for -// the first N nodes. class ArenaPlanner : public MemoryPlanner { public: // Ownership of 'context' is not taken and it must remain util the - // ArenaPlanner is destroyed. 
If 'preserve_inputs' is true the inputs to the - // graph will not share memory with any other tensor, effectively preserving - // them until the end of inference. + // ArenaPlanner is destroyed. If 'preserve_inputs' is true the inputs + // to the graph will not share memory with any other tensor, effectively + // preserving them until the end of inference. ArenaPlanner(TfLiteContext* context, std::unique_ptr<GraphInfo> graph_info, bool preserve_inputs, bool preserve_intermediates, int tensor_alignment = kDefaultTensorAlignment); @@ -82,29 +78,37 @@ class ArenaPlanner : public MemoryPlanner { // position inside the corresponding arena buffer. TfLiteStatus ResolveTensorAllocation(int tensor_index); - // Register an allocation for the given tensor. - TfLiteStatus CalculateTensorAllocation(int tensor_index); + void AddTensorIfNeeded(int tensor_index); - // Register a deallocation for the given tensor. - TfLiteStatus CalculateTensorDeallocation(int tensor_index); - - // Register an allocation for all internal (temporary) tensors of - // 'node_index'. - TfLiteStatus CalculateAllocationOfInternalTensors(int node_index); - - // Register a deallocation for all internal (temporary) tensors of - // 'node_index'. - TfLiteStatus CalculateDeallocationOfInternalTensors(int node_index); + // Comparator used to sort tensors for the allocation algorithm: + // - Tensors that stay alive for the whole inference go first; + // - Other tensors (e.g. intermediate and temporary ones) are sorted in + // non-increasing order of their size. If the sizes of two tensors are + // equal, the one that needs to be allocated earlier goes first. + struct CompareBySize { + explicit CompareBySize(const ArenaPlanner* planner) : planner(planner) {} + bool operator()(const int idx1, const int idx2) const; + const ArenaPlanner* planner; + }; TfLiteContext* context_; std::unique_ptr<GraphInfo> graph_info_; // Stores allocation data for all tensors. - std::vector<ArenaAlloc> allocs_; + std::vector<ArenaAllocWithUsageInterval> allocs_; - // A chronological list of instructions to allocate and deallocate tensors, - // reflecting the way they are used in the graph. - std::vector<AllocationInfo> alloc_queue_; + // First node that uses the tensor. The tensor needs to be allocated before + // this node's operation is executed. + std::vector<size_t> alloc_node_; + + // Last node that uses the tensor. The tensor can be deallocated after this + // node's operation has executed. + std::vector<size_t> dealloc_node_; + + // Indices of tensors in the order in which their allocation offsets will be + // calculated. + std::vector<size_t> order_; + std::vector<char> was_added_; // avoiding std::vector<bool> as bitset // Raw memory buffer that is allocated for all temporary and graph outputs // that are declared kTfLiteArenaRw. diff --git a/tensorflow/lite/arena_planner_test.cc b/tensorflow/lite/arena_planner_test.cc index 0e80d429c0d..83a08c0a487 100644 --- a/tensorflow/lite/arena_planner_test.cc +++ b/tensorflow/lite/arena_planner_test.cc @@ -18,8 +18,8 @@ limitations under the License.
#include #include -#include "tensorflow/lite/testing/util.h" #include "tensorflow/core/platform/logging.h" +#include "tensorflow/lite/testing/util.h" namespace tflite { namespace { @@ -198,7 +198,7 @@ class ArenaPlannerTest : public ::testing::Test { offset += kTensorAlignment - offset % kTensorAlignment; } return offset; - }; + } TfLiteContext context_; TestGraph* graph_; @@ -211,18 +211,6 @@ TEST_F(ArenaPlannerTest, EmptyGraph) { Execute(0, 10); } -TEST_F(ArenaPlannerTest, DeallocationOfInputTensor) { - // This is a negative TC, which will try to make sure that no allocation for - // input tensors is done, when making call with negative node_index, since - // previous check was doing comparison of node_index which was int and - // unsigned int, implicit conversion was passing this case, as the negative - // number was converted to unsigned it making it invalid.The new check - // takes care of this problem and removes the warning as well. - TestGraph graph({-1}, {}, {1}); - SetGraph(&graph); - Execute(0, 10); -} - TEST_F(ArenaPlannerTest, GraphWithNoOps) { TestGraph graph({0, 10}, {}, {5, 11}); SetGraph(&graph); @@ -239,8 +227,8 @@ TEST_F(ArenaPlannerTest, GraphWithOneOp) { TestGraph graph({1}, {{{1}, {2}, {}}}, {2}); SetGraph(&graph); Execute(0, 10); - EXPECT_EQ(GetOffset(1), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(2), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); } TEST_F(ArenaPlannerTest, ZeroSizedTensors) { @@ -264,12 +252,12 @@ TEST_F(ArenaPlannerTest, SimpleGraph) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphInputsPreserved) { @@ -287,13 +275,10 @@ TEST_F(ArenaPlannerTest, SimpleGraphInputsPreserved) { // Alloc(+) and dealloc(-) order: +0 +1 +2 +4 +5 -2 +3 -4 -5 EXPECT_EQ(GetOffset(0), 0); EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - // Because we are keeping the inputs alive until the end (due to - // preserve_inputs=true), the output tensor will not be able to use that - // space. It will end up using the same are as tensor #2. 
- EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); } TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) { @@ -309,12 +294,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithTemporary) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) { @@ -330,12 +315,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithOptionals) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +4 +5 -2 -0 +3 -4 -5 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) { @@ -355,12 +340,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithLargeTensor) { Execute(0, 10); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(1), 0); EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); } TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) { @@ -386,12 +371,12 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithPersistentTensor) { EXPECT_NE((*graph.tensors())[0].data.raw, (*graph.tensors())[1].data.raw); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), 0); EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(1), 0); } TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) { @@ -413,11 +398,11 @@ TEST_F(ArenaPlannerTest, SimpleGraphWithDynamicTensor) { EXPECT_EQ((*graph.tensors())[1].data.raw, nullptr); // Alloc(+) and dealloc(-) order: +0 +1 +2 -1 +5 +4 -2 -0 -5 +3 -4 - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(5), 0); EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(2)); } TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { @@ -446,10 +431,10 @@ TEST_F(ArenaPlannerTest, 
LargerGraphAndStepwiseAllocation) { // Op4: +10 -4 -5 -8 Execute(0, 0); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); EXPECT_TRUE(is_unallocated(6)); EXPECT_TRUE(is_unallocated(4)); EXPECT_TRUE(is_unallocated(5)); @@ -459,62 +444,61 @@ TEST_F(ArenaPlannerTest, LargerGraphAndStepwiseAllocation) { EXPECT_TRUE(is_unallocated(10)); Execute(1, 1); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); EXPECT_TRUE(is_unallocated(7)); EXPECT_TRUE(is_unallocated(9)); EXPECT_TRUE(is_unallocated(8)); EXPECT_TRUE(is_unallocated(10)); Execute(2, 2); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - // Here's an interesting allocation. Even though #6 requires only 21 bytes, - // its deallocation freed up 24 bytes due to the alignment requirements in - // the arena. That means we can fit #7 in the same space! - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + // #7 (24 bytes) is allocated at the place, where #0 and #6 (4+24=28 bytes) + // were before their deallocation. + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); EXPECT_TRUE(is_unallocated(9)); EXPECT_TRUE(is_unallocated(8)); EXPECT_TRUE(is_unallocated(10)); Execute(3, 3); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); - // The deallocation of #0, #1 and #2 freed up 24 bytes but that's not enough - // for #9, so it goes at the end. - EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); + // The deallocation of #1 and #2 frees up 20 bytes but that's not enough + // neither for #9, nor for #8, so they both go at the end. 
+ EXPECT_EQ(GetOffset(9), GetOffsetAfter(4)); EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); EXPECT_TRUE(is_unallocated(10)); Execute(4, 4); - EXPECT_EQ(GetOffset(0), 0); - EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); - EXPECT_EQ(GetOffset(2), GetOffsetAfter(1)); - EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); - EXPECT_EQ(GetOffset(6), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(4), GetOffsetAfter(6)); - EXPECT_EQ(GetOffset(5), GetOffsetAfter(4)); - EXPECT_EQ(GetOffset(7), GetOffsetAfter(3)); - EXPECT_EQ(GetOffset(9), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), 0); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(0), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(9), GetOffsetAfter(4)); EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); - // There's just enough space at the beginning for #10 due to the - // deallocation of #0, #1, #2 and #3 (total 36 bytes, #10 needs + // There is enough space at the beginning for #10 due to the + // deallocation of #7, #1, #2 and #3 (total 56 bytes, #10 needs // only 33.) EXPECT_EQ(GetOffset(10), 0); } @@ -547,6 +531,86 @@ TEST_F(ArenaPlannerTest, ModifiedGraph) { EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); } +TEST_F(ArenaPlannerTest, ComplexGraph) { + TestGraph graph({0}, + { + /* in, out, tmp */ + {{0}, {1}, {}}, + {{1}, {2}, {}}, + {{1}, {3}, {}}, + {{1}, {4}, {}}, + {{2, 3, 4}, {5}, {}}, + {{5}, {6}, {}}, + {{5}, {7}, {}}, + {{6, 7}, {8}, {}}, + }, + {8}); + (*graph.tensors())[0].bytes = 32; + (*graph.tensors())[1].bytes = 28; + (*graph.tensors())[2].bytes = 36; + (*graph.tensors())[3].bytes = 16; + (*graph.tensors())[4].bytes = 8; + (*graph.tensors())[5].bytes = 64; + (*graph.tensors())[6].bytes = 10; + (*graph.tensors())[7].bytes = 40; + SetGraph(&graph); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order: +0 +1 -0 +2 +3 +4 -1 +5 -2 -3 -4 +6 +7 -5 +8 + EXPECT_EQ(GetOffset(5), 0); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(7)); + EXPECT_EQ(GetOffset(2), GetOffsetAfter(5)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(2)); + EXPECT_EQ(GetOffset(4), GetOffsetAfter(3)); + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(8), 0); +} + +TEST_F(ArenaPlannerTest, GraphWithIntermediates) { + TestGraph graph({0, 1}, + { + /* in, out, tmp */ + {{0}, {2}, {3}}, + {{1, 2}, {4, 5}, {}}, + {{5}, {6, 7}, {8, 9, 10}}, + {{4, 6}, {11}, {12}}, + {{11}, {13}, {}}, + {{7, 13}, {14}, {15}}, + }, + {11, 14}); + SetGraph(&graph, /*preserve_inputs=*/true); + Execute(0, 10); + + // Alloc(+) and dealloc(-) order by operation: + // Op0: +0 +1 +2 +3 -3 + // Op1: +4 +5 -2 -4 + // Op2: +6 +7 +8 +9 +10 -8 -9 -10 -5 + // Op3: +11 +12 -12 -4 -6 + // Op4: +13 + // Op5: +14 +15 -7 -13 -15 + EXPECT_EQ(GetOffset(0), 0); + EXPECT_EQ(GetOffset(1), GetOffsetAfter(0)); + EXPECT_EQ(GetOffset(15), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(14), GetOffsetAfter(15)); + EXPECT_EQ(GetOffset(13), GetOffsetAfter(14)); + EXPECT_EQ(GetOffset(12), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(11), GetOffsetAfter(13)); + EXPECT_EQ(GetOffset(10), GetOffsetAfter(1)); + EXPECT_EQ(GetOffset(9), GetOffsetAfter(10)); + EXPECT_EQ(GetOffset(8), GetOffsetAfter(9)); + EXPECT_EQ(GetOffset(7), GetOffsetAfter(11)); + EXPECT_EQ(GetOffset(6), GetOffsetAfter(8)); + EXPECT_EQ(GetOffset(5), GetOffsetAfter(6)); 
+ EXPECT_EQ(GetOffset(4), GetOffsetAfter(7)); + EXPECT_EQ(GetOffset(3), GetOffsetAfter(1)); + + // 2 is allocated in the smallest suitable gap, which is not equal to the + // first available one. + EXPECT_EQ(GetOffset(2), GetOffsetAfter(5)); +} + } // namespace } // namespace tflite diff --git a/tensorflow/lite/core/subgraph.cc b/tensorflow/lite/core/subgraph.cc index d2c6b874702..6097e1fd7d1 100644 --- a/tensorflow/lite/core/subgraph.cc +++ b/tensorflow/lite/core/subgraph.cc @@ -162,7 +162,6 @@ Subgraph::Subgraph(ErrorReporter* error_reporter, : external_contexts_(external_contexts), error_reporter_(error_reporter), next_execution_plan_index_to_prepare_(0), - next_execution_plan_index_to_plan_allocation_(0), subgraphs_(subgraphs), resource_variables_(resource_variables) { context_.impl_ = static_cast(this); @@ -496,7 +495,6 @@ TfLiteStatus Subgraph::AllocateTensors() { } next_execution_plan_index_to_prepare_ = 0; - next_execution_plan_index_to_plan_allocation_ = 0; if (memory_planner_) { TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); } @@ -695,13 +693,10 @@ TfLiteStatus Subgraph::PrepareOpsAndTensors() { TF_LITE_ENSURE_STATUS(PrepareOpsStartingAt( next_execution_plan_index_to_prepare_, &last_exec_plan_index_prepared)); - next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; TF_LITE_ENSURE_STATUS(memory_planner_->ExecuteAllocations( - next_execution_plan_index_to_plan_allocation_, - last_exec_plan_index_prepared)); - next_execution_plan_index_to_plan_allocation_ = - last_exec_plan_index_prepared + 1; + next_execution_plan_index_to_prepare_, last_exec_plan_index_prepared)); + next_execution_plan_index_to_prepare_ = last_exec_plan_index_prepared + 1; return kTfLiteOk; } @@ -776,22 +771,6 @@ TfLiteStatus Subgraph::Invoke() { if (tensor_resized_since_op_invoke_ && HasDynamicTensor(context_, node.outputs)) { next_execution_plan_index_to_prepare_ = execution_plan_index + 1; - - // This happens when an intermediate dynamic tensor is resized. - // We don't have to prepare all the ops, but we need to recompute - // the allocation plan. - // - // This is a workaround for b/127354079. It relies on the property that - // ArenaPlanner's behavior is deterministic. A better solution is being - // able to "Rewind" to a specific index in ArenaPlanner. - // TODO(b/127354079): Improve ArenaPlanner and remove this mechanism. - if (next_execution_plan_index_to_plan_allocation_ > - next_execution_plan_index_to_prepare_) { - next_execution_plan_index_to_plan_allocation_ = 0; - if (memory_planner_) { - TF_LITE_ENSURE_STATUS(memory_planner_->ResetAllocations()); - } - } } } diff --git a/tensorflow/lite/core/subgraph.h b/tensorflow/lite/core/subgraph.h index b9736d89f9a..c80a0f355c9 100644 --- a/tensorflow/lite/core/subgraph.h +++ b/tensorflow/lite/core/subgraph.h @@ -527,14 +527,6 @@ class Subgraph { // NOTE: this relies on the order of nodes that is in topological order. int next_execution_plan_index_to_prepare_; - // This is similar to `next_execution_plan_index_to_prepare_`, but it tracks - // which nodes' allocation is planned with the arena planner. - // - // This is a workaround for b/127354079. It shouldn't be necessary if - // ArenaPlanner can "rewind" to a specific point. - // TODO(b/127354079): Improve ArenaPlanner and remove this mechanism. - int next_execution_plan_index_to_plan_allocation_; - // WARNING: This is an experimental interface that is subject to change. // This is a list of node indices (to index into nodes_and_registration). 
// This represents a valid topological sort (dependency ordered) execution diff --git a/tensorflow/lite/interpreter_test.cc b/tensorflow/lite/interpreter_test.cc index f6d8bae4eff..fa303549363 100644 --- a/tensorflow/lite/interpreter_test.cc +++ b/tensorflow/lite/interpreter_test.cc @@ -364,15 +364,14 @@ TEST(BasicInterpreter, CheckArenaAllocation) { ASSERT_EQ(interpreter.AllocateTensors(), kTfLiteOk); ASSERT_LT(interpreter.tensor(0)->data.raw, interpreter.tensor(1)->data.raw); - ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(2)->data.raw); - ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(3)->data.raw); - ASSERT_LT(interpreter.tensor(3)->data.raw, interpreter.tensor(4)->data.raw); - ASSERT_LT(interpreter.tensor(4)->data.raw, interpreter.tensor(5)->data.raw); - ASSERT_LT(interpreter.tensor(5)->data.raw, interpreter.tensor(7)->data.raw); - ASSERT_EQ(interpreter.tensor(6)->data.raw, interpreter.tensor(2)->data.raw); - // #7 is the one with the largest pointer. + ASSERT_LT(interpreter.tensor(1)->data.raw, interpreter.tensor(3)->data.raw); + ASSERT_EQ(interpreter.tensor(3)->data.raw, interpreter.tensor(9)->data.raw); + ASSERT_LT(interpreter.tensor(3)->data.raw, interpreter.tensor(5)->data.raw); + ASSERT_LT(interpreter.tensor(5)->data.raw, interpreter.tensor(2)->data.raw); + ASSERT_EQ(interpreter.tensor(2)->data.raw, interpreter.tensor(7)->data.raw); + ASSERT_LT(interpreter.tensor(2)->data.raw, interpreter.tensor(4)->data.raw); + // #4 is the one with the largest pointer. ASSERT_EQ(interpreter.tensor(8)->data.raw, nullptr); - ASSERT_EQ(interpreter.tensor(9)->data.raw, interpreter.tensor(5)->data.raw); } TEST(BasicInterpreter, BufferAccess) { diff --git a/tensorflow/lite/memory_planner.h b/tensorflow/lite/memory_planner.h index 2c2b357d52f..fa2a44a1c89 100644 --- a/tensorflow/lite/memory_planner.h +++ b/tensorflow/lite/memory_planner.h @@ -21,11 +21,6 @@ namespace tflite { // A MemoryPlanner is responsible for planning and executing a number of // memory-related operations that are necessary in TF Lite. -// -// TODO(b/127354079): Remove the constrain below when the issue is fixed. -// WARNING: MemoryPlanner's behavior must be deterministic. If the first N -// nodes are unchanged, it must produce exactly the same allocation plan for -// the first N nodes. class MemoryPlanner { public: virtual ~MemoryPlanner() {} diff --git a/tensorflow/lite/simple_memory_arena.cc b/tensorflow/lite/simple_memory_arena.cc index 88bdf50c9b6..167a845cd23 100644 --- a/tensorflow/lite/simple_memory_arena.cc +++ b/tensorflow/lite/simple_memory_arena.cc @@ -31,73 +31,55 @@ T AlignTo(size_t alignment, T offset) { } // namespace namespace tflite { - -TfLiteStatus SimpleMemoryArena::Allocate(TfLiteContext* context, - size_t alignment, size_t size, - ArenaAlloc* new_alloc) { +TfLiteStatus SimpleMemoryArena::Allocate( + TfLiteContext* context, size_t alignment, size_t size, size_t first_node, + size_t last_node, ArenaAllocWithUsageInterval* new_alloc) { TF_LITE_ENSURE(context, alignment <= arena_alignment_); + new_alloc->first_node = first_node; + new_alloc->last_node = last_node; + new_alloc->size = size; if (size == 0) { new_alloc->offset = 0; - new_alloc->size = 0; return kTfLiteOk; } - size_t current_top = 0; - - if (!allocs_.empty()) { - auto last = allocs_.rbegin(); - current_top = last->offset + last->size; - } - // If we don't find a better gap just allocate at the end of the buffer. 
- size_t best_offset = AlignTo(alignment, current_top); - size_t best_offset_fit = std::numeric_limits::max(); - auto best_insertion_it = allocs_.end(); + const size_t kNotAssigned = std::numeric_limits::max(); + size_t best_offset = kNotAssigned; + size_t best_offset_fit = kNotAssigned; // Go through the sorted allocs and look at the gaps between them. size_t current_offset = 0; - for (auto it = allocs_.begin(); it != allocs_.end(); ++it) { + for (const auto& alloc : ordered_allocs_) { + if (alloc.last_node < first_node || alloc.first_node > last_node) { + // Usage interval of alloc doesn't intersect with current tensor's usage + // interval, so we skip it. + continue; + } size_t aligned_current_offset = AlignTo(alignment, current_offset); // If we found a gap larger than required size, and smaller than previous // best fit, take it. - if (aligned_current_offset + size <= it->offset && - it->offset - current_offset < best_offset_fit) { + if (aligned_current_offset + size <= alloc.offset && + alloc.offset - aligned_current_offset < best_offset_fit) { best_offset = aligned_current_offset; - best_offset_fit = it->offset - current_offset; - best_insertion_it = it; + best_offset_fit = alloc.offset - current_offset; } - current_offset = it->offset + it->size; + current_offset = std::max(current_offset, alloc.offset + alloc.size); + } + if (best_offset == kNotAssigned) { + best_offset = AlignTo(alignment, current_offset); } // Update the required buffer size. high_water_mark_ = std::max(high_water_mark_, best_offset + size); - new_alloc->offset = best_offset; - new_alloc->size = size; - allocs_.insert(best_insertion_it, *new_alloc); - return kTfLiteOk; -} - -TfLiteStatus SimpleMemoryArena::Deallocate(TfLiteContext* context, - const ArenaAlloc& alloc) { - if (alloc.size == 0) { - return kTfLiteOk; + auto insertion_it = ordered_allocs_.begin(); + while (insertion_it != ordered_allocs_.end() && *insertion_it < *new_alloc) { + ++insertion_it; } - - int erased_allocs_count = 0; - auto it = allocs_.begin(); - while (it != allocs_.end()) { - if (it->offset == alloc.offset) { - TF_LITE_ENSURE_EQ(context, it->size, alloc.size); - erased_allocs_count++; - it = allocs_.erase(it); - } else { - ++it; - } - } - TF_LITE_ENSURE_EQ(context, erased_allocs_count, 1); + ordered_allocs_.insert(insertion_it, *new_alloc); return kTfLiteOk; } @@ -128,9 +110,9 @@ TfLiteStatus SimpleMemoryArena::Commit(TfLiteContext* context) { return underlying_buffer_ != nullptr ? kTfLiteOk : kTfLiteError; } -TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context, - const ArenaAlloc& alloc, - char** output_ptr) { +TfLiteStatus SimpleMemoryArena::ResolveAlloc( + TfLiteContext* context, const ArenaAllocWithUsageInterval& alloc, + char** output_ptr) { TF_LITE_ENSURE(context, committed_); TF_LITE_ENSURE(context, output_ptr != nullptr); if (alloc.size == 0) { @@ -144,7 +126,7 @@ TfLiteStatus SimpleMemoryArena::ResolveAlloc(TfLiteContext* context, TfLiteStatus SimpleMemoryArena::Clear() { committed_ = false; high_water_mark_ = 0; - allocs_.clear(); + ordered_allocs_.clear(); return kTfLiteOk; } diff --git a/tensorflow/lite/simple_memory_arena.h b/tensorflow/lite/simple_memory_arena.h index 42203c0c0a3..6fec52d15b0 100644 --- a/tensorflow/lite/simple_memory_arena.h +++ b/tensorflow/lite/simple_memory_arena.h @@ -22,16 +22,19 @@ limitations under the License. namespace tflite { // This little structure holds the offset and the size for a dynamic memory -// allocation in the memory arena. 
When the arena is committed and the -// underlying buffer is set, the alloc can be resolved into an actual memory -// pointer. -struct ArenaAlloc { - ArenaAlloc() : offset(0), size(0) {} - +// allocation in the memory arena, as well as the first_node and last_node that +// use the corresponding tensor. This means that a contiguous block of memory of +// this size needs to be allocated before the operation in first_node is executed +// and can be deallocated after the operation in last_node has executed. When the +// arena is committed and the underlying buffer is set, the alloc can be +// resolved into an actual memory pointer. +struct ArenaAllocWithUsageInterval { size_t offset; size_t size; + size_t first_node; + size_t last_node; - inline bool operator<(const ArenaAlloc& other) const { + inline bool operator<(const ArenaAllocWithUsageInterval& other) const { return offset < other.offset; } }; @@ -48,12 +51,14 @@ class SimpleMemoryArena { arena_alignment_(arena_alignment), high_water_mark_(0), underlying_buffer_size_(0), - allocs_() {} + ordered_allocs_() {} + // Schedule a memory allocation for a tensor of the given size, assuming that + // it needs to be allocated before the execution of first_node and can be + // deallocated after the execution of last_node. TfLiteStatus Allocate(TfLiteContext* context, size_t alignment, size_t size, - ArenaAlloc* new_alloc); - - TfLiteStatus Deallocate(TfLiteContext* context, const ArenaAlloc& alloc); + size_t first_node, size_t last_node, + ArenaAllocWithUsageInterval* new_alloc); inline size_t RequiredBufferSize() { // Add in a small amount of padding to reduce the chance of resize events @@ -64,7 +69,8 @@ class SimpleMemoryArena { TfLiteStatus Commit(TfLiteContext* context); - TfLiteStatus ResolveAlloc(TfLiteContext* context, const ArenaAlloc& alloc, + TfLiteStatus ResolveAlloc(TfLiteContext* context, + const ArenaAllocWithUsageInterval& alloc, char** output_ptr); TfLiteStatus Clear(); @@ -80,8 +86,7 @@ class SimpleMemoryArena { std::unique_ptr<char[]> underlying_buffer_; size_t underlying_buffer_size_; char* underlying_buffer_aligned_ptr_; - // TODO(maciekc): add list iterator to the ArenaAlloc to lookup quickly.
- std::list allocs_; + std::list ordered_allocs_; }; } // namespace tflite diff --git a/tensorflow/lite/simple_memory_arena_test.cc b/tensorflow/lite/simple_memory_arena_test.cc index caf13db2c1a..5300ebe5ea2 100644 --- a/tensorflow/lite/simple_memory_arena_test.cc +++ b/tensorflow/lite/simple_memory_arena_test.cc @@ -24,39 +24,33 @@ namespace { TEST(SimpleMemoryArenaTest, BasicArenaOperations) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[6]; + ArenaAllocWithUsageInterval allocs[6]; - arena.Allocate(&context, 32, 2047, &allocs[0]); - arena.Allocate(&context, 32, 2047, &allocs[1]); - arena.Allocate(&context, 32, 2047, &allocs[2]); - arena.Deallocate(&context, allocs[0]); - arena.Allocate(&context, 32, 1023, &allocs[3]); - arena.Allocate(&context, 32, 2047, &allocs[4]); - arena.Deallocate(&context, allocs[1]); - arena.Allocate(&context, 32, 1023, &allocs[5]); + arena.Allocate(&context, 32, 2047, 1, 3, &allocs[0]); + arena.Allocate(&context, 32, 2047, 2, 5, &allocs[1]); + arena.Allocate(&context, 32, 2047, 3, 6, &allocs[2]); + arena.Allocate(&context, 32, 2047, 5, 6, &allocs[3]); + arena.Allocate(&context, 32, 1023, 4, 6, &allocs[4]); + arena.Allocate(&context, 32, 1023, 6, 6, &allocs[5]); EXPECT_EQ(allocs[0].offset, 0); EXPECT_EQ(allocs[1].offset, 2048); EXPECT_EQ(allocs[2].offset, 4096); EXPECT_EQ(allocs[3].offset, 0); EXPECT_EQ(allocs[4].offset, 6144); - EXPECT_EQ(allocs[5].offset, 1024); + EXPECT_EQ(allocs[5].offset, 2048); } TEST(SimpleMemoryArenaTest, BasicZeroAlloc) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc alloc; + ArenaAllocWithUsageInterval alloc; // Zero-sized allocs should have a 0 offset and size. - ASSERT_EQ(arena.Allocate(&context, 32, 0, &alloc), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 0, 1, 2, &alloc), kTfLiteOk); EXPECT_EQ(alloc.offset, 0); EXPECT_EQ(alloc.size, 0); - // Deallocation of zero-sized allocs should always succeed (even redundantly). - ASSERT_EQ(arena.Deallocate(&context, alloc), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, alloc), kTfLiteOk); - // The zero-sized alloc should resolve to null. char* resolved_ptr = nullptr; ASSERT_EQ(arena.Commit(&context), kTfLiteOk); @@ -67,15 +61,13 @@ TEST(SimpleMemoryArenaTest, BasicZeroAlloc) { TEST(SimpleMemoryArenaTest, InterleavedZeroAlloc) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[4]; + ArenaAllocWithUsageInterval allocs[4]; // Interleave some zero and non-zero-sized allocations and deallocations. - ASSERT_EQ(arena.Allocate(&context, 32, 2047, &allocs[0]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 0, &allocs[1]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 1023, &allocs[2]), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, allocs[1]), kTfLiteOk); - ASSERT_EQ(arena.Deallocate(&context, allocs[2]), kTfLiteOk); - ASSERT_EQ(arena.Allocate(&context, 32, 2047, &allocs[3]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 2047, 0, 4, &allocs[0]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 0, 1, 2, &allocs[1]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 1023, 1, 2, &allocs[2]), kTfLiteOk); + ASSERT_EQ(arena.Allocate(&context, 32, 2047, 3, 4, &allocs[3]), kTfLiteOk); // Deallocation of a zero-sized alloc should not impact the allocator offsets. 
EXPECT_EQ(allocs[0].offset, 0); @@ -87,11 +79,11 @@ TEST(SimpleMemoryArenaTest, InterleavedZeroAlloc) { TEST(SimpleMemoryArenaTest, TestAfterClear) { TfLiteContext context; SimpleMemoryArena arena(64); - ArenaAlloc allocs[9]; + ArenaAllocWithUsageInterval allocs[9]; - arena.Allocate(&context, 32, 2047, &allocs[0]); - arena.Allocate(&context, 32, 2047, &allocs[1]); - arena.Allocate(&context, 32, 2047, &allocs[2]); + arena.Allocate(&context, 32, 2047, 0, 2, &allocs[0]); + arena.Allocate(&context, 32, 2047, 1, 2, &allocs[1]); + arena.Allocate(&context, 32, 2047, 1, 2, &allocs[2]); arena.Commit(&context); EXPECT_EQ(allocs[0].offset, 0); @@ -101,9 +93,9 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) { arena.Clear(); // Test with smaller allocs. - arena.Allocate(&context, 32, 1023, &allocs[3]); - arena.Allocate(&context, 32, 1023, &allocs[4]); - arena.Allocate(&context, 32, 1023, &allocs[5]); + arena.Allocate(&context, 32, 1023, 0, 2, &allocs[3]); + arena.Allocate(&context, 32, 1023, 1, 2, &allocs[4]); + arena.Allocate(&context, 32, 1023, 1, 2, &allocs[5]); arena.Commit(&context); EXPECT_EQ(allocs[3].offset, 0); @@ -113,9 +105,9 @@ TEST(SimpleMemoryArenaTest, TestAfterClear) { arena.Clear(); // Test larger allocs which should require a reallocation. - arena.Allocate(&context, 32, 4095, &allocs[6]); - arena.Allocate(&context, 32, 4095, &allocs[7]); - arena.Allocate(&context, 32, 4095, &allocs[8]); + arena.Allocate(&context, 32, 4095, 0, 2, &allocs[6]); + arena.Allocate(&context, 32, 4095, 1, 2, &allocs[7]); + arena.Allocate(&context, 32, 4095, 1, 2, &allocs[8]); arena.Commit(&context); EXPECT_EQ(allocs[6].offset, 0);
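
Note (illustration, not part of the patch): the core of the change above is the "greedy by size" placement that ArenaPlanner::CalculateAllocations() and SimpleMemoryArena::Allocate() implement together. Tensors are sorted in non-increasing order of size; each tensor is then placed at the lowest offset where it fits, considering only already placed tensors whose usage intervals (first_node..last_node) overlap its own, and preferring the tightest such gap (best fit). The self-contained sketch below restates that idea; it is hypothetical code, not the TensorFlow Lite implementation: TensorUsage and PlanOffsets are invented names, and alignment handling as well as the "whole-lifetime tensors first" tie-break of CompareBySize are omitted for brevity.

// Minimal sketch of greedy-by-size offset planning (illustrative only).
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

struct TensorUsage {
  size_t size;        // bytes required for the tensor
  size_t first_node;  // first node that uses the tensor
  size_t last_node;   // last node that uses the tensor
  size_t offset;      // output: assigned offset inside the arena
};

// Assigns an offset to every tensor and returns the arena's high-water mark.
inline size_t PlanOffsets(std::vector<TensorUsage>& tensors) {
  constexpr size_t kNotAssigned = std::numeric_limits<size_t>::max();

  // Visit tensors in non-increasing order of size.
  std::vector<size_t> order(tensors.size());
  for (size_t i = 0; i < order.size(); ++i) order[i] = i;
  std::sort(order.begin(), order.end(), [&](size_t a, size_t b) {
    return tensors[a].size > tensors[b].size;
  });

  std::vector<size_t> placed;  // already placed tensors, kept sorted by offset
  size_t high_water_mark = 0;
  for (size_t idx : order) {
    TensorUsage& t = tensors[idx];
    size_t best_offset = kNotAssigned;
    size_t best_gap = kNotAssigned;
    size_t current_offset = 0;
    for (size_t p : placed) {
      const TensorUsage& other = tensors[p];
      // Tensors whose usage intervals do not overlap may share memory, so
      // they impose no constraint on where `t` can go.
      if (other.last_node < t.first_node || other.first_node > t.last_node) {
        continue;
      }
      // Best fit: take the smallest gap between conflicting tensors that is
      // still large enough for `t`.
      if (current_offset + t.size <= other.offset &&
          other.offset - current_offset < best_gap) {
        best_offset = current_offset;
        best_gap = other.offset - current_offset;
      }
      current_offset = std::max(current_offset, other.offset + other.size);
    }
    // No suitable gap found: place the tensor after the last conflicting one.
    if (best_offset == kNotAssigned) best_offset = current_offset;
    t.offset = best_offset;
    high_water_mark = std::max(high_water_mark, best_offset + t.size);

    // Keep `placed` ordered by offset so the gap scan above stays correct.
    placed.insert(std::upper_bound(placed.begin(), placed.end(), idx,
                                   [&](size_t a, size_t b) {
                                     return tensors[a].offset <
                                            tensors[b].offset;
                                   }),
                  idx);
  }
  return high_water_mark;
}

Sorting by size before placing is what makes the plan "greedy by size": the largest tensors claim space first, and smaller tensors are packed into the gaps left between tensors whose lifetimes conflict, instead of being appended in chronological alloc/dealloc order as the previous planner did.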