Allow different tensor size type in TensorUsageRecord and in the naive algorithm.
Use size_t instead of uint32_t in all memory management algorithms. PiperOrigin-RevId: 253913153
This commit is contained in:
parent
e982459f83
commit
ee21809f1b
@ -29,7 +29,7 @@ namespace gpu {
|
||||
namespace {
|
||||
|
||||
struct PoolRecord {
|
||||
PoolRecord(uint32_t size, size_t obj_id)
|
||||
PoolRecord(size_t size, size_t obj_id)
|
||||
: object_size(size), object_id(obj_id) {}
|
||||
|
||||
// Objects in pool are ordered by size.
|
||||
@ -38,7 +38,7 @@ struct PoolRecord {
|
||||
(object_size == other.object_size && object_id < other.object_id);
|
||||
}
|
||||
|
||||
uint32_t object_size;
|
||||
size_t object_size;
|
||||
size_t object_id;
|
||||
};
|
||||
|
||||
@ -61,8 +61,10 @@ struct QueueRecord {
|
||||
//
|
||||
// The problem of memory management is NP-complete. This implements a
|
||||
// naive algorithm that assigns each tensor to a separate object in memory.
|
||||
Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
ObjectsAssignment* assignment) {
|
||||
template <typename TensorSizeT>
|
||||
Status NaiveAssignment(
|
||||
const std::vector<TensorUsageRecord<TensorSizeT>>& usage_records,
|
||||
ObjectsAssignment<TensorSizeT>* assignment) {
|
||||
assignment->object_sizes.resize(usage_records.size());
|
||||
assignment->object_ids.resize(usage_records.size());
|
||||
for (size_t i = 0; i < usage_records.size(); i++) {
|
||||
@ -79,7 +81,8 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
// greedy algorithm that approximates an optimal solution with following
|
||||
// heuristic:
|
||||
//
|
||||
// 1. Iterates through all tensor usage records and for every object reference
|
||||
// 1. Iterates through all tensor usage records and for every object
|
||||
// reference
|
||||
// assigns shared object from the pool. When object reference is used
|
||||
// for the last time, corresponding shared object is returned back to
|
||||
// the pool.
|
||||
@ -88,8 +91,9 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
// available.
|
||||
//
|
||||
// 3. Shared object size may increase when tensor requests larger size.
|
||||
Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
ObjectsAssignment* assignment) {
|
||||
Status GreedyAssignment(
|
||||
const std::vector<TensorUsageRecord<size_t>>& usage_records,
|
||||
ObjectsAssignment<size_t>* assignment) {
|
||||
assignment->object_sizes.clear();
|
||||
assignment->object_ids.resize(usage_records.size());
|
||||
|
||||
@ -108,7 +112,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
pool.insert({assignment->object_sizes[object_id], object_id});
|
||||
objects_in_use.pop();
|
||||
}
|
||||
uint32_t tensor_size = usage_records[i].tensor_size;
|
||||
size_t tensor_size = usage_records[i].tensor_size;
|
||||
if (pool.empty()) {
|
||||
// No free shared object, creating a new one, assign i-th tensor to
|
||||
// it and add to the queue of objects in use.
|
||||
@ -121,7 +125,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
// Find shared object from pool, that will waste the least possible
|
||||
// amount of memory when reused for current tensor.
|
||||
auto pool_it = pool.lower_bound({tensor_size, 0});
|
||||
uint32_t size_diff = 0;
|
||||
size_t size_diff = 0;
|
||||
if (pool_it != pool.end()) {
|
||||
// Try smallest shared object from pool with size >= tensor_size.
|
||||
size_diff = pool_it->object_size - tensor_size;
|
||||
@ -139,7 +143,8 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
|
||||
// best_it can't be equal to pool.end(), because pool is not empty
|
||||
if (best_it == pool.end()) {
|
||||
return InternalError(
|
||||
"No shared object is found in non-empty pool in GreedyAssignment.");
|
||||
"No shared object is found in non-empty pool in "
|
||||
"GreedyAssignment.");
|
||||
}
|
||||
size_t shared_id = best_it->object_id;
|
||||
pool.erase(best_it);
|
||||
@ -158,7 +163,7 @@ class MinCostFlowSolver {
|
||||
public:
|
||||
// Build auxiliary flow graph, based on information about intermediate
|
||||
// tensors.
|
||||
void Build(const std::vector<TensorUsageRecord>& usage_records) {
|
||||
void Build(const std::vector<TensorUsageRecord<size_t>>& usage_records) {
|
||||
usage_records_ = &usage_records;
|
||||
num_tensors_ = usage_records.size();
|
||||
source_ = 2 * num_tensors_;
|
||||
@ -167,8 +172,8 @@ class MinCostFlowSolver {
|
||||
std::vector<size_t> old_record_ids;
|
||||
std::priority_queue<QueueRecord> objects_in_use;
|
||||
for (size_t i = 0; i < usage_records.size(); i++) {
|
||||
// Pop from the queue all objects that are no longer in use at the time of
|
||||
// execution of the first_task of i-th intermediate tensor.
|
||||
// Pop from the queue all objects that are no longer in use at the time
|
||||
// of execution of the first_task of i-th intermediate tensor.
|
||||
while (!objects_in_use.empty() &&
|
||||
objects_in_use.top().last_task < usage_records[i].first_task) {
|
||||
old_record_ids.push_back(objects_in_use.top().object_id);
|
||||
@ -186,8 +191,8 @@ class MinCostFlowSolver {
|
||||
// Edges from vertices of the left part of flow graph, corresponding to
|
||||
// old_record_ids, to i-th vertex in the right part of flow graph are
|
||||
// added for the case of reusing previously created shared objects for
|
||||
// i-th tensor. Cost of these edges is an approximation of the size of new
|
||||
// allocated memory.
|
||||
// i-th tensor. Cost of these edges is an approximation of the size of
|
||||
// new allocated memory.
|
||||
for (auto record_id : old_record_ids) {
|
||||
int cost = 0;
|
||||
if (usage_records[i].tensor_size >
|
||||
@ -251,7 +256,7 @@ class MinCostFlowSolver {
|
||||
}
|
||||
}
|
||||
|
||||
void CalculateAssignment(ObjectsAssignment* assignment) {
|
||||
void CalculateAssignment(ObjectsAssignment<size_t>* assignment) {
|
||||
assignment->object_sizes.clear();
|
||||
assignment->object_ids.resize(num_tensors_);
|
||||
is_tensor_assigned_.resize(num_tensors_);
|
||||
@ -273,9 +278,9 @@ class MinCostFlowSolver {
|
||||
int cost;
|
||||
};
|
||||
|
||||
// Add edge from vertex src to vertex dst with given capacity and cost and its
|
||||
// reversed edge to the flow graph. If some edge has index idx, its reversed
|
||||
// edge has index idx^1.
|
||||
// Add edge from vertex src to vertex dst with given capacity and cost and
|
||||
// its reversed edge to the flow graph. If some edge has index idx, its
|
||||
// reversed edge has index idx^1.
|
||||
void AddEdge(size_t src, size_t dst, int cap, int cost) {
|
||||
edges_from_[src].push_back(edges_.size());
|
||||
edges_.emplace_back(dst, cap, cost);
|
||||
@ -288,8 +293,8 @@ class MinCostFlowSolver {
|
||||
return vertex_id >= num_tensors_ && vertex_id < 2 * num_tensors_;
|
||||
}
|
||||
|
||||
// Return vertex from another part of the graph, that corresponds to the same
|
||||
// intermediate tensor.
|
||||
// Return vertex from another part of the graph, that corresponds to the
|
||||
// same intermediate tensor.
|
||||
size_t LeftPartTwin(size_t vertex_id) const {
|
||||
return vertex_id - num_tensors_;
|
||||
}
|
||||
@ -299,13 +304,13 @@ class MinCostFlowSolver {
|
||||
|
||||
// This function uses recursive implementation of depth-first search and
|
||||
// returns maximum size from tensor tensor_id and all tensors, that will be
|
||||
// allocated at the same place with it after all operations that use tensor_id
|
||||
// are executed. Next tensor to be allocated at the same place with tensor_id
|
||||
// is a left part twin of such vertex v, that the edge tensor_id->v is
|
||||
// saturated (has zero residual capacity).
|
||||
uint32_t AssignTensorsToNewSharedObject(size_t tensor_id,
|
||||
ObjectsAssignment* assignment) {
|
||||
uint32_t cost = (*usage_records_)[tensor_id].tensor_size;
|
||||
// allocated at the same place with it after all operations that use
|
||||
// tensor_id are executed. Next tensor to be allocated at the same place
|
||||
// with tensor_id is a left part twin of such vertex v, that the edge
|
||||
// tensor_id->v is saturated (has zero residual capacity).
|
||||
size_t AssignTensorsToNewSharedObject(size_t tensor_id,
|
||||
ObjectsAssignment<size_t>* assignment) {
|
||||
size_t cost = (*usage_records_)[tensor_id].tensor_size;
|
||||
is_tensor_assigned_[tensor_id] = true;
|
||||
assignment->object_ids[tensor_id] = assignment->object_sizes.size();
|
||||
for (const auto& edge_id : edges_from_[tensor_id]) {
|
||||
@ -324,7 +329,7 @@ class MinCostFlowSolver {
|
||||
size_t source_;
|
||||
size_t sink_;
|
||||
size_t num_tensors_;
|
||||
const std::vector<TensorUsageRecord>* usage_records_;
|
||||
const std::vector<TensorUsageRecord<size_t>>* usage_records_;
|
||||
std::vector<Edge> edges_;
|
||||
std::vector<std::vector<size_t>> edges_from_;
|
||||
std::vector<bool> is_tensor_assigned_;
|
||||
@ -337,8 +342,8 @@ class MinCostFlowSolver {
|
||||
// assignment of shared objects to tensors, using the result of the flow
|
||||
// algorithm.
|
||||
Status MinCostFlowAssignment(
|
||||
const std::vector<TensorUsageRecord>& usage_records,
|
||||
ObjectsAssignment* assignment) {
|
||||
const std::vector<TensorUsageRecord<size_t>>& usage_records,
|
||||
ObjectsAssignment<size_t>* assignment) {
|
||||
MinCostFlowSolver solver;
|
||||
solver.Build(usage_records);
|
||||
solver.Solve();
|
||||
@ -349,11 +354,11 @@ Status MinCostFlowAssignment(
|
||||
} // namespace
|
||||
|
||||
Status AssignObjectsToTensors(
|
||||
const std::vector<TensorUsageRecord>& usage_records,
|
||||
const MemoryStrategy& strategy, ObjectsAssignment* assignment) {
|
||||
const std::vector<TensorUsageRecord<size_t>>& usage_records,
|
||||
const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment) {
|
||||
switch (strategy) {
|
||||
case MemoryStrategy::NAIVE:
|
||||
return NaiveAssignment(usage_records, assignment);
|
||||
return NaiveAssignment<size_t>(usage_records, assignment);
|
||||
case MemoryStrategy::GREEDY:
|
||||
return GreedyAssignment(usage_records, assignment);
|
||||
case MemoryStrategy::MINCOSTFLOW:
|
||||
|
@ -28,31 +28,32 @@ namespace gpu {
|
||||
|
||||
using TaskId = size_t;
|
||||
|
||||
// Record, containing tensor size and IDs of the first and the last task, that
|
||||
// use this tensor as input or output.
|
||||
// For example: tensor #3 with size tensor_size=65536 is first introduced in
|
||||
// program #2 (first_task=2) and used for the last time in program #7
|
||||
// (last_task=7).
|
||||
// Record, containing tensor size and IDs of the first and the last task,
|
||||
// that use this tensor as input or output. For example: tensor #3 with size
|
||||
// tensor_size=65536 is first introduced in program #2 (first_task=2) and used
|
||||
// for the last time in program #7 (last_task=7).
|
||||
template <typename TensorSizeT>
|
||||
struct TensorUsageRecord {
|
||||
uint32_t tensor_size;
|
||||
TensorSizeT tensor_size;
|
||||
TaskId first_task;
|
||||
TaskId last_task;
|
||||
|
||||
TensorUsageRecord(uint32_t size, TaskId first, TaskId last)
|
||||
TensorUsageRecord(TensorSizeT size, TaskId first, TaskId last)
|
||||
: tensor_size(size), first_task(first), last_task(last) {}
|
||||
|
||||
// Default order of tensor usage records is increasing order of first_task.
|
||||
bool operator<(const TensorUsageRecord& other) const {
|
||||
bool operator<(const TensorUsageRecord<TensorSizeT>& other) const {
|
||||
return first_task < other.first_task;
|
||||
}
|
||||
};
|
||||
|
||||
// Information about assignment of tensors to shared objects
|
||||
template <typename TensorSizeT>
|
||||
struct ObjectsAssignment {
|
||||
// object_ids[i] is the ID of the shared object that tensor i will be using.
|
||||
std::vector<size_t> object_ids;
|
||||
// object_sizes[i] is the size of the shared object with ID equal to i.
|
||||
std::vector<uint32_t> object_sizes;
|
||||
std::vector<TensorSizeT> object_sizes;
|
||||
};
|
||||
|
||||
enum class MemoryStrategy {
|
||||
@ -71,10 +72,11 @@ enum class MemoryStrategy {
|
||||
};
|
||||
|
||||
// Calculates the assignment of shared objects to given tensors, including
|
||||
// objects' sizes.
|
||||
// objects' sizes. Initial tensor sizes are given as size_t. This function is
|
||||
// intended for use with GPU buffers.
|
||||
Status AssignObjectsToTensors(
|
||||
const std::vector<TensorUsageRecord>& usage_records,
|
||||
const MemoryStrategy& strategy, ObjectsAssignment* assignment);
|
||||
const std::vector<TensorUsageRecord<size_t>>& usage_records,
|
||||
const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment);
|
||||
|
||||
} // namespace gpu
|
||||
} // namespace tflite
|
||||
|
@ -25,7 +25,7 @@ namespace {
|
||||
using ::testing::ElementsAre;
|
||||
|
||||
TEST(Model, EmptyRecords) {
|
||||
ObjectsAssignment assignment;
|
||||
ObjectsAssignment<size_t> assignment;
|
||||
ASSERT_TRUE(
|
||||
AssignObjectsToTensors({}, MemoryStrategy::NAIVE, &assignment).ok());
|
||||
EXPECT_TRUE(assignment.object_ids.empty());
|
||||
@ -42,9 +42,9 @@ TEST(Model, EmptyRecords) {
|
||||
}
|
||||
|
||||
TEST(Model, OneRecord) {
|
||||
std::vector<TensorUsageRecord> usage_records{
|
||||
std::vector<TensorUsageRecord<size_t>> usage_records{
|
||||
{/*size=*/16, /*first=*/0, /*last=*/1}};
|
||||
ObjectsAssignment assignment;
|
||||
ObjectsAssignment<size_t> assignment;
|
||||
ASSERT_TRUE(
|
||||
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
|
||||
.ok());
|
||||
@ -63,14 +63,14 @@ TEST(Model, OneRecord) {
|
||||
}
|
||||
|
||||
TEST(Model, ChainRecords) {
|
||||
std::vector<TensorUsageRecord> usage_records{
|
||||
std::vector<TensorUsageRecord<size_t>> usage_records{
|
||||
{/*size=*/16, /*first=*/0, /*last=*/1},
|
||||
{/*size=*/8, /*first=*/1, /*last=*/2},
|
||||
{/*size=*/64, /*first=*/2, /*last=*/3},
|
||||
{/*size=*/32, /*first=*/3, /*last=*/4},
|
||||
{/*size=*/8, /*first=*/4, /*last=*/5},
|
||||
};
|
||||
ObjectsAssignment assignment;
|
||||
ObjectsAssignment<size_t> assignment;
|
||||
ASSERT_TRUE(
|
||||
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
|
||||
.ok());
|
||||
@ -89,7 +89,7 @@ TEST(Model, ChainRecords) {
|
||||
}
|
||||
|
||||
TEST(Model, ComplexRecords) {
|
||||
std::vector<TensorUsageRecord> usage_records{
|
||||
std::vector<TensorUsageRecord<size_t>> usage_records{
|
||||
{/*size=*/32, /*first=*/0, /*last=*/1},
|
||||
{/*size=*/32, /*first=*/1, /*last=*/4},
|
||||
{/*size=*/8, /*first=*/2, /*last=*/5},
|
||||
@ -99,7 +99,7 @@ TEST(Model, ComplexRecords) {
|
||||
{/*size=*/8, /*first=*/6, /*last=*/8},
|
||||
{/*size=*/8, /*first=*/7, /*last=*/8},
|
||||
{/*size=*/16, /*first=*/8, /*last=*/9}};
|
||||
ObjectsAssignment assignment;
|
||||
ObjectsAssignment<size_t> assignment;
|
||||
ASSERT_TRUE(
|
||||
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
|
||||
.ok());
|
||||
@ -111,6 +111,7 @@ TEST(Model, ComplexRecords) {
|
||||
.ok());
|
||||
EXPECT_THAT(assignment.object_ids, ElementsAre(0, 1, 0, 2, 3, 1, 3, 2, 0));
|
||||
EXPECT_THAT(assignment.object_sizes, ElementsAre(32, 64, 16, 8));
|
||||
|
||||
ASSERT_TRUE(AssignObjectsToTensors(usage_records, MemoryStrategy::MINCOSTFLOW,
|
||||
&assignment)
|
||||
.ok());
|
||||
|
@ -88,7 +88,7 @@ using ::tflite::gpu::TensorUsageRecord;
|
||||
|
||||
// TODO(ypisarchyk): it makes sense to move this to a separate function
|
||||
// Generate usage records for each intermediate tensor in order of their first_task
|
||||
std::vector<TensorUsageRecord> usageRecords;
|
||||
std::vector<TensorUsageRecord<size_t>> usageRecords;
|
||||
std::map<ValueId, size_t> usageRecordIds;
|
||||
for (uint32_t i = 0; i < taskDescriptors.size(); ++i) {
|
||||
auto outputId = taskDescriptors[i]->output_buffer.id;
|
||||
@ -111,7 +111,7 @@ using ::tflite::gpu::TensorUsageRecord;
|
||||
}
|
||||
}
|
||||
|
||||
tflite::gpu::ObjectsAssignment assignment;
|
||||
tflite::gpu::ObjectsAssignment<size_t> assignment;
|
||||
RETURN_IF_ERROR(AssignObjectsToTensors(usageRecords, MemoryStrategy::GREEDY, &assignment));
|
||||
auto objectsCount = assignment.object_sizes.size();
|
||||
std::vector<id<MTLBuffer>> sharedBuffers(objectsCount);
|
||||
|
Loading…
Reference in New Issue
Block a user