diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.cc b/tensorflow/lite/delegates/gpu/common/memory_management.cc
index 73a27a3c4ea..4a1444f928e 100644
--- a/tensorflow/lite/delegates/gpu/common/memory_management.cc
+++ b/tensorflow/lite/delegates/gpu/common/memory_management.cc
@@ -29,7 +29,7 @@ namespace gpu {
 namespace {
 
 struct PoolRecord {
-  PoolRecord(uint32_t size, size_t obj_id)
+  PoolRecord(size_t size, size_t obj_id)
       : object_size(size), object_id(obj_id) {}
 
   // Objects in pool are ordered by size.
@@ -38,7 +38,7 @@ struct PoolRecord {
            (object_size == other.object_size && object_id < other.object_id);
   }
 
-  uint32_t object_size;
+  size_t object_size;
   size_t object_id;
 };
 
@@ -61,8 +61,10 @@ struct QueueRecord {
 //
 // The problem of memory management is NP-complete. This implements a
 // naive algorithm that assigns each tensor to a separate object in memory.
-Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
-                       ObjectsAssignment* assignment) {
+template <typename TensorSizeT>
+Status NaiveAssignment(
+    const std::vector<TensorUsageRecord<TensorSizeT>>& usage_records,
+    ObjectsAssignment<TensorSizeT>* assignment) {
   assignment->object_sizes.resize(usage_records.size());
   assignment->object_ids.resize(usage_records.size());
   for (size_t i = 0; i < usage_records.size(); i++) {
@@ -79,7 +81,8 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
 // greedy algorithm that approximates an optimal solution with following
 // heuristic:
 //
-// 1. Iterates through all tensor usage records and for every object reference
+// 1. Iterates through all tensor usage records and for every object
+// reference
 //    assigns shared object from the pool. When object reference is used
 //    for the last time, corresponding shared object is returned back to
 //    the pool.
@@ -88,8 +91,9 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
 //    available.
 //
 // 3. Shared object size may increase when tensor requests larger size.
-Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
-                        ObjectsAssignment* assignment) {
+Status GreedyAssignment(
+    const std::vector<TensorUsageRecord<size_t>>& usage_records,
+    ObjectsAssignment<size_t>* assignment) {
   assignment->object_sizes.clear();
   assignment->object_ids.resize(usage_records.size());
 
@@ -108,7 +112,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
       pool.insert({assignment->object_sizes[object_id], object_id});
       objects_in_use.pop();
     }
-    uint32_t tensor_size = usage_records[i].tensor_size;
+    size_t tensor_size = usage_records[i].tensor_size;
    if (pool.empty()) {
       // No free shared object, creating a new one, assign i-th tensor to
       // it and add to the queue of objects in use.
@@ -121,7 +125,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
       // Find shared object from pool, that will waste the least possible
       // amount of memory when reused for current tensor.
       auto pool_it = pool.lower_bound({tensor_size, 0});
-      uint32_t size_diff = 0;
+      size_t size_diff = 0;
       if (pool_it != pool.end()) {
         // Try smallest shared object from pool with size >= tensor_size.
        size_diff = pool_it->object_size - tensor_size;
@@ -139,7 +143,8 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
       // best_it can't be equal to pool.end(), because pool is not empty
       if (best_it == pool.end()) {
         return InternalError(
-            "No shared object is found in non-empty pool in GreedyAssignment.");
+            "No shared object is found in non-empty pool in "
+            "GreedyAssignment.");
       }
       size_t shared_id = best_it->object_id;
       pool.erase(best_it);
@@ -158,7 +163,7 @@ class MinCostFlowSolver {
  public:
   // Build auxiliary flow graph, based on information about intermediate
   // tensors.
-  void Build(const std::vector<TensorUsageRecord>& usage_records) {
+  void Build(const std::vector<TensorUsageRecord<size_t>>& usage_records) {
     usage_records_ = &usage_records;
     num_tensors_ = usage_records.size();
     source_ = 2 * num_tensors_;
@@ -167,8 +172,8 @@ class MinCostFlowSolver {
     std::vector<size_t> old_record_ids;
     std::priority_queue<QueueRecord> objects_in_use;
     for (size_t i = 0; i < usage_records.size(); i++) {
-      // Pop from the queue all objects that are no longer in use at the time of
-      // execution of the first_task of i-th intermediate tensor.
+      // Pop from the queue all objects that are no longer in use at the time
+      // of execution of the first_task of i-th intermediate tensor.
       while (!objects_in_use.empty() &&
              objects_in_use.top().last_task < usage_records[i].first_task) {
         old_record_ids.push_back(objects_in_use.top().object_id);
@@ -186,8 +191,8 @@ class MinCostFlowSolver {
       // Edges from vertices of the left part of flow graph, corresponding to
       // old_record_ids, to i-th vertex in the right part of flow graph are
       // added for the case of reusing previously created shared objects for
-      // i-th tensor. Cost of these edges is an approximation of the size of new
-      // allocated memory.
+      // i-th tensor. Cost of these edges is an approximation of the size of
+      // new allocated memory.
       for (auto record_id : old_record_ids) {
         int cost = 0;
         if (usage_records[i].tensor_size >
@@ -251,7 +256,7 @@ class MinCostFlowSolver {
     }
   }
 
-  void CalculateAssignment(ObjectsAssignment* assignment) {
+  void CalculateAssignment(ObjectsAssignment<size_t>* assignment) {
     assignment->object_sizes.clear();
     assignment->object_ids.resize(num_tensors_);
     is_tensor_assigned_.resize(num_tensors_);
@@ -273,9 +278,9 @@ class MinCostFlowSolver {
     int cost;
   };
 
-  // Add edge from vertex src to vertex dst with given capacity and cost and its
-  // reversed edge to the flow graph. If some edge has index idx, its reversed
-  // edge has index idx^1.
+  // Add edge from vertex src to vertex dst with given capacity and cost and
+  // its reversed edge to the flow graph. If some edge has index idx, its
+  // reversed edge has index idx^1.
   void AddEdge(size_t src, size_t dst, int cap, int cost) {
     edges_from_[src].push_back(edges_.size());
     edges_.emplace_back(dst, cap, cost);
@@ -288,8 +293,8 @@ class MinCostFlowSolver {
     return vertex_id >= num_tensors_ && vertex_id < 2 * num_tensors_;
   }
 
-  // Return vertex from another part of the graph, that corresponds to the same
-  // intermediate tensor.
+  // Return vertex from another part of the graph, that corresponds to the
+  // same intermediate tensor.
   size_t LeftPartTwin(size_t vertex_id) const {
     return vertex_id - num_tensors_;
   }
@@ -299,13 +304,13 @@ class MinCostFlowSolver {
 
   // This function uses recursive implementation of depth-first search and
   // returns maximum size from tensor tensor_id and all tensors, that will be
-  // allocated at the same place with it after all operations that use tensor_id
-  // are executed. Next tensor to be allocated at the same place with tensor_id
-  // is a left part twin of such vertex v, that the edge tensor_id->v is
-  // saturated (has zero residual capacity).
-  uint32_t AssignTensorsToNewSharedObject(size_t tensor_id,
-                                          ObjectsAssignment* assignment) {
-    uint32_t cost = (*usage_records_)[tensor_id].tensor_size;
+  // allocated at the same place with it after all operations that use
+  // tensor_id are executed. Next tensor to be allocated at the same place
+  // with tensor_id is a left part twin of such vertex v, that the edge
+  // tensor_id->v is saturated (has zero residual capacity).
+  size_t AssignTensorsToNewSharedObject(size_t tensor_id,
+                                        ObjectsAssignment<size_t>* assignment) {
+    size_t cost = (*usage_records_)[tensor_id].tensor_size;
     is_tensor_assigned_[tensor_id] = true;
     assignment->object_ids[tensor_id] = assignment->object_sizes.size();
     for (const auto& edge_id : edges_from_[tensor_id]) {
@@ -324,7 +329,7 @@ class MinCostFlowSolver {
   size_t source_;
   size_t sink_;
   size_t num_tensors_;
-  const std::vector<TensorUsageRecord>* usage_records_;
+  const std::vector<TensorUsageRecord<size_t>>* usage_records_;
   std::vector<Edge> edges_;
   std::vector<std::vector<size_t>> edges_from_;
   std::vector<bool> is_tensor_assigned_;
@@ -337,8 +342,8 @@ class MinCostFlowSolver {
 // assignment of shared objects to tensors, using the result of the flow
 // algorithm.
 Status MinCostFlowAssignment(
-    const std::vector<TensorUsageRecord>& usage_records,
-    ObjectsAssignment* assignment) {
+    const std::vector<TensorUsageRecord<size_t>>& usage_records,
+    ObjectsAssignment<size_t>* assignment) {
   MinCostFlowSolver solver;
   solver.Build(usage_records);
   solver.Solve();
@@ -349,11 +354,11 @@ Status MinCostFlowAssignment(
 }  // namespace
 
 Status AssignObjectsToTensors(
-    const std::vector<TensorUsageRecord>& usage_records,
-    const MemoryStrategy& strategy, ObjectsAssignment* assignment) {
+    const std::vector<TensorUsageRecord<size_t>>& usage_records,
+    const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment) {
   switch (strategy) {
     case MemoryStrategy::NAIVE:
-      return NaiveAssignment(usage_records, assignment);
+      return NaiveAssignment<size_t>(usage_records, assignment);
     case MemoryStrategy::GREEDY:
       return GreedyAssignment(usage_records, assignment);
     case MemoryStrategy::MINCOSTFLOW:
diff --git a/tensorflow/lite/delegates/gpu/common/memory_management.h b/tensorflow/lite/delegates/gpu/common/memory_management.h
index 4b8023b8d54..299685be00d 100644
--- a/tensorflow/lite/delegates/gpu/common/memory_management.h
+++ b/tensorflow/lite/delegates/gpu/common/memory_management.h
@@ -28,31 +28,32 @@ namespace gpu {
 
 using TaskId = size_t;
 
-// Record, containing tensor size and IDs of the first and the last task, that
-// use this tensor as input or output.
-// For example: tensor #3 with size tensor_size=65536 is first introduced in
-// program #2 (first_task=2) and used for the last time in program #7
-// (last_task=7).
+// Record, containing tensor size and IDs of the first and the last task,
+// that use this tensor as input or output. For example: tensor #3 with size
+// tensor_size=65536 is first introduced in program #2 (first_task=2) and used
+// for the last time in program #7 (last_task=7).
+template <typename TensorSizeT>
 struct TensorUsageRecord {
-  uint32_t tensor_size;
+  TensorSizeT tensor_size;
   TaskId first_task;
   TaskId last_task;
 
-  TensorUsageRecord(uint32_t size, TaskId first, TaskId last)
+  TensorUsageRecord(TensorSizeT size, TaskId first, TaskId last)
       : tensor_size(size), first_task(first), last_task(last) {}
 
   // Default order of tensor usage records is increasing order of first_task.
-  bool operator<(const TensorUsageRecord& other) const {
+  bool operator<(const TensorUsageRecord<TensorSizeT>& other) const {
     return first_task < other.first_task;
   }
 };
 
 // Information about assignment of tensors to shared objects
+template <typename TensorSizeT>
 struct ObjectsAssignment {
   // shared_object_ids_[i] is ID of shared object, that tensor i will be using.
   std::vector<size_t> object_ids;
   // shared_object_sizes_[i] is a size of shared object with ID equal to i.
-  std::vector<uint32_t> object_sizes;
+  std::vector<TensorSizeT> object_sizes;
 };
 
 enum class MemoryStrategy {
@@ -71,10 +72,11 @@ enum class MemoryStrategy {
 };
 
 // Calculates the assignement of shared objects to given tensors, including
-// objects' sizes.
+// objects' sizes. Initial tensor sizes are given as size_t. This function is
+// intended to be used with GPU buffers.
 Status AssignObjectsToTensors(
-    const std::vector<TensorUsageRecord>& usage_records,
-    const MemoryStrategy& strategy, ObjectsAssignment* assignment);
+    const std::vector<TensorUsageRecord<size_t>>& usage_records,
+    const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment);
 
 }  // namespace gpu
 }  // namespace tflite
diff --git a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
index a1484cd0e55..fa263575f3a 100644
--- a/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
+++ b/tensorflow/lite/delegates/gpu/common/memory_management_test.cc
@@ -25,7 +25,7 @@ namespace {
 using ::testing::ElementsAre;
 
 TEST(Model, EmptyRecords) {
-  ObjectsAssignment assignment;
+  ObjectsAssignment<size_t> assignment;
   ASSERT_TRUE(
       AssignObjectsToTensors({}, MemoryStrategy::NAIVE, &assignment).ok());
   EXPECT_TRUE(assignment.object_ids.empty());
@@ -42,9 +42,9 @@ TEST(Model, EmptyRecords) {
 }
 
 TEST(Model, OneRecord) {
-  std::vector<TensorUsageRecord> usage_records{
+  std::vector<TensorUsageRecord<size_t>> usage_records{
       {/*size=*/16, /*first=*/0, /*last=*/1}};
-  ObjectsAssignment assignment;
+  ObjectsAssignment<size_t> assignment;
   ASSERT_TRUE(
       AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
           .ok());
@@ -63,14 +63,14 @@ TEST(Model, OneRecord) {
 }
 
 TEST(Model, ChainRecords) {
-  std::vector<TensorUsageRecord> usage_records{
+  std::vector<TensorUsageRecord<size_t>> usage_records{
       {/*size=*/16, /*first=*/0, /*last=*/1},
       {/*size=*/8, /*first=*/1, /*last=*/2},
       {/*size=*/64, /*first=*/2, /*last=*/3},
       {/*size=*/32, /*first=*/3, /*last=*/4},
       {/*size=*/8, /*first=*/4, /*last=*/5},
   };
-  ObjectsAssignment assignment;
+  ObjectsAssignment<size_t> assignment;
   ASSERT_TRUE(
       AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
           .ok());
@@ -89,7 +89,7 @@ TEST(Model, ChainRecords) {
 }
 
 TEST(Model, ComplexRecords) {
-  std::vector<TensorUsageRecord> usage_records{
+  std::vector<TensorUsageRecord<size_t>> usage_records{
       {/*size=*/32, /*first=*/0, /*last=*/1},
       {/*size=*/32, /*first=*/1, /*last=*/4},
       {/*size=*/8, /*first=*/2, /*last=*/5},
@@ -99,7 +99,7 @@ TEST(Model, ComplexRecords) {
       {/*size=*/8, /*first=*/6, /*last=*/8},
       {/*size=*/8, /*first=*/7, /*last=*/8},
       {/*size=*/16, /*first=*/8, /*last=*/9}};
-  ObjectsAssignment assignment;
+  ObjectsAssignment<size_t> assignment;
   ASSERT_TRUE(
       AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
          .ok());
@@ -111,6 +111,7 @@ TEST(Model, ComplexRecords) {
           .ok());
   EXPECT_THAT(assignment.object_ids, ElementsAre(0, 1, 0, 2, 3, 1, 3, 2, 0));
   EXPECT_THAT(assignment.object_sizes, ElementsAre(32, 64, 16, 8));
+
   ASSERT_TRUE(AssignObjectsToTensors(usage_records, MemoryStrategy::MINCOSTFLOW,
                                      &assignment)
                   .ok());
diff --git a/tensorflow/lite/delegates/gpu/metal/inference_context.mm b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
index 720872ad8a6..309e36ebeca 100644
--- a/tensorflow/lite/delegates/gpu/metal/inference_context.mm
+++ b/tensorflow/lite/delegates/gpu/metal/inference_context.mm
@@ -88,7 +88,7 @@ using ::tflite::gpu::TensorUsageRecord;
 
   // TODO(ypisarchyk): it make sense to move it to separate function
   // Generate usage records for each intermediate tensor in order of their first_task
-  std::vector<TensorUsageRecord> usageRecords;
+  std::vector<TensorUsageRecord<size_t>> usageRecords;
   std::map<ValueId, size_t> usageRecordIds;
   for (uint32_t i = 0; i < taskDescriptors.size(); ++i) {
     auto outputId = taskDescriptors[i]->output_buffer.id;
@@ -111,7 +111,7 @@ using ::tflite::gpu::TensorUsageRecord;
     }
   }
 
-  tflite::gpu::ObjectsAssignment assignment;
+  tflite::gpu::ObjectsAssignment<size_t> assignment;
   RETURN_IF_ERROR(AssignObjectsToTensors(usageRecords, MemoryStrategy::GREEDY, &assignment));
   auto objectsCount = assignment.object_sizes.size();
   std::vector<id<MTLBuffer>> sharedBuffers(objectsCount);
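
Usage sketch (illustrative, not part of the patch): a minimal example of how the templated API reads after this change, mirroring the updated tests. The types, header path, and AssignObjectsToTensors signature come from memory_management.h above; the main() wrapper, the example record sizes, and the printed output are assumptions made for the demo.

#include <cstddef>
#include <iostream>
#include <vector>

#include "tensorflow/lite/delegates/gpu/common/memory_management.h"

int main() {
  using tflite::gpu::MemoryStrategy;
  using tflite::gpu::ObjectsAssignment;
  using tflite::gpu::TensorUsageRecord;

  // Three intermediate tensors; tensor 0 dies at task 1, so its buffer can be
  // recycled for tensor 2. Sizes are now size_t rather than uint32_t, so a
  // record can describe buffers larger than 4 GiB.
  std::vector<TensorUsageRecord<size_t>> usage_records{
      {/*size=*/16, /*first=*/0, /*last=*/1},
      {/*size=*/8, /*first=*/1, /*last=*/2},
      {/*size=*/64, /*first=*/2, /*last=*/3}};

  ObjectsAssignment<size_t> assignment;
  if (!tflite::gpu::AssignObjectsToTensors(usage_records,
                                           MemoryStrategy::GREEDY, &assignment)
           .ok()) {
    return 1;
  }

  // Greedy reuse: tensors 0 and 2 never overlap in time, so they share
  // object 0, which grows from 16 to 64 bytes; tensor 1 gets its own
  // 8-byte object 1. Expected mapping: 0 -> 0, 1 -> 1, 2 -> 0.
  for (size_t i = 0; i < assignment.object_ids.size(); ++i) {
    const size_t obj = assignment.object_ids[i];
    std::cout << "tensor " << i << " -> object " << obj << " (size "
              << assignment.object_sizes[obj] << ")\n";
  }
  return 0;
}

Note that only the size_t instantiation is exposed through AssignObjectsToTensors in this patch; TensorUsageRecord<TensorSizeT> admits other size types, but no other instantiation is wired up here.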