Allow different tensor size type in TensorUsageRecord and in the naive algorithm.

Use size_t instead of uint32_t in all memory management algorithms.

PiperOrigin-RevId: 253913153
This commit is contained in:
A. Unique TensorFlower 2019-06-18 18:47:42 -07:00 committed by TensorFlower Gardener
parent e982459f83
commit ee21809f1b
4 changed files with 63 additions and 55 deletions

View File

@ -29,7 +29,7 @@ namespace gpu {
namespace {
struct PoolRecord {
PoolRecord(uint32_t size, size_t obj_id)
PoolRecord(size_t size, size_t obj_id)
: object_size(size), object_id(obj_id) {}
// Objects in pool are ordered by size.
@ -38,7 +38,7 @@ struct PoolRecord {
(object_size == other.object_size && object_id < other.object_id);
}
uint32_t object_size;
size_t object_size;
size_t object_id;
};
@ -61,8 +61,10 @@ struct QueueRecord {
//
// The problem of memory management is NP-complete. This implements a
// naive algorithm that assigns each tensor to a separate object in memory.
Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
ObjectsAssignment* assignment) {
template <typename TensorSizeT>
Status NaiveAssignment(
const std::vector<TensorUsageRecord<TensorSizeT>>& usage_records,
ObjectsAssignment<TensorSizeT>* assignment) {
assignment->object_sizes.resize(usage_records.size());
assignment->object_ids.resize(usage_records.size());
for (size_t i = 0; i < usage_records.size(); i++) {
@ -79,7 +81,8 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
// greedy algorithm that approximates an optimal solution with following
// heuristic:
//
// 1. Iterates through all tensor usage records and for every object reference
// 1. Iterates through all tensor usage records and for every object
// reference
// assigns shared object from the pool. When object reference is used
// for the last time, corresponding shared object is returned back to
// the pool.
@ -88,8 +91,9 @@ Status NaiveAssignment(const std::vector<TensorUsageRecord>& usage_records,
// available.
//
// 3. Shared object size may increase when tensor requests larger size.
Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
ObjectsAssignment* assignment) {
Status GreedyAssignment(
const std::vector<TensorUsageRecord<size_t>>& usage_records,
ObjectsAssignment<size_t>* assignment) {
assignment->object_sizes.clear();
assignment->object_ids.resize(usage_records.size());
@ -108,7 +112,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
pool.insert({assignment->object_sizes[object_id], object_id});
objects_in_use.pop();
}
uint32_t tensor_size = usage_records[i].tensor_size;
size_t tensor_size = usage_records[i].tensor_size;
if (pool.empty()) {
// No free shared object, creating a new one, assign i-th tensor to
// it and add to the queue of objects in use.
@ -121,7 +125,7 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
// Find shared object from pool, that will waste the least possible
// amount of memory when reused for current tensor.
auto pool_it = pool.lower_bound({tensor_size, 0});
uint32_t size_diff = 0;
size_t size_diff = 0;
if (pool_it != pool.end()) {
// Try smallest shared object from pool with size >= tensor_size.
size_diff = pool_it->object_size - tensor_size;
@ -139,7 +143,8 @@ Status GreedyAssignment(const std::vector<TensorUsageRecord>& usage_records,
// best_it can't be equal to pool.end(), because pool is not empty
if (best_it == pool.end()) {
return InternalError(
"No shared object is found in non-empty pool in GreedyAssignment.");
"No shared object is found in non-empty pool in "
"GreedyAssignment.");
}
size_t shared_id = best_it->object_id;
pool.erase(best_it);
@ -158,7 +163,7 @@ class MinCostFlowSolver {
public:
// Build auxiliary flow graph, based on information about intermediate
// tensors.
void Build(const std::vector<TensorUsageRecord>& usage_records) {
void Build(const std::vector<TensorUsageRecord<size_t>>& usage_records) {
usage_records_ = &usage_records;
num_tensors_ = usage_records.size();
source_ = 2 * num_tensors_;
@ -167,8 +172,8 @@ class MinCostFlowSolver {
std::vector<size_t> old_record_ids;
std::priority_queue<QueueRecord> objects_in_use;
for (size_t i = 0; i < usage_records.size(); i++) {
// Pop from the queue all objects that are no longer in use at the time of
// execution of the first_task of i-th intermediate tensor.
// Pop from the queue all objects that are no longer in use at the time
// of execution of the first_task of i-th intermediate tensor.
while (!objects_in_use.empty() &&
objects_in_use.top().last_task < usage_records[i].first_task) {
old_record_ids.push_back(objects_in_use.top().object_id);
@ -186,8 +191,8 @@ class MinCostFlowSolver {
// Edges from vertices of the left part of flow graph, corresponding to
// old_record_ids, to i-th vertex in the right part of flow graph are
// added for the case of reusing previously created shared objects for
// i-th tensor. Cost of these edges is an approximation of the size of new
// allocated memory.
// i-th tensor. Cost of these edges is an approximation of the size of
// new allocated memory.
for (auto record_id : old_record_ids) {
int cost = 0;
if (usage_records[i].tensor_size >
@ -251,7 +256,7 @@ class MinCostFlowSolver {
}
}
void CalculateAssignment(ObjectsAssignment* assignment) {
void CalculateAssignment(ObjectsAssignment<size_t>* assignment) {
assignment->object_sizes.clear();
assignment->object_ids.resize(num_tensors_);
is_tensor_assigned_.resize(num_tensors_);
@ -273,9 +278,9 @@ class MinCostFlowSolver {
int cost;
};
// Add edge from vertex src to vertex dst with given capacity and cost and its
// reversed edge to the flow graph. If some edge has index idx, its reversed
// edge has index idx^1.
// Add edge from vertex src to vertex dst with given capacity and cost and
// its reversed edge to the flow graph. If some edge has index idx, its
// reversed edge has index idx^1.
void AddEdge(size_t src, size_t dst, int cap, int cost) {
edges_from_[src].push_back(edges_.size());
edges_.emplace_back(dst, cap, cost);
@ -288,8 +293,8 @@ class MinCostFlowSolver {
return vertex_id >= num_tensors_ && vertex_id < 2 * num_tensors_;
}
// Return vertex from another part of the graph, that corresponds to the same
// intermediate tensor.
// Return vertex from another part of the graph, that corresponds to the
// same intermediate tensor.
size_t LeftPartTwin(size_t vertex_id) const {
return vertex_id - num_tensors_;
}
@ -299,13 +304,13 @@ class MinCostFlowSolver {
// This function uses recursive implementation of depth-first search and
// returns maximum size from tensor tensor_id and all tensors, that will be
// allocated at the same place with it after all operations that use tensor_id
// are executed. Next tensor to be allocated at the same place with tensor_id
// is a left part twin of such vertex v, that the edge tensor_id->v is
// saturated (has zero residual capacity).
uint32_t AssignTensorsToNewSharedObject(size_t tensor_id,
ObjectsAssignment* assignment) {
uint32_t cost = (*usage_records_)[tensor_id].tensor_size;
// allocated at the same place with it after all operations that use
// tensor_id are executed. Next tensor to be allocated at the same place
// with tensor_id is a left part twin of such vertex v, that the edge
// tensor_id->v is saturated (has zero residual capacity).
size_t AssignTensorsToNewSharedObject(size_t tensor_id,
ObjectsAssignment<size_t>* assignment) {
size_t cost = (*usage_records_)[tensor_id].tensor_size;
is_tensor_assigned_[tensor_id] = true;
assignment->object_ids[tensor_id] = assignment->object_sizes.size();
for (const auto& edge_id : edges_from_[tensor_id]) {
@ -324,7 +329,7 @@ class MinCostFlowSolver {
size_t source_;
size_t sink_;
size_t num_tensors_;
const std::vector<TensorUsageRecord>* usage_records_;
const std::vector<TensorUsageRecord<size_t>>* usage_records_;
std::vector<Edge> edges_;
std::vector<std::vector<size_t>> edges_from_;
std::vector<bool> is_tensor_assigned_;
@ -337,8 +342,8 @@ class MinCostFlowSolver {
// assignment of shared objects to tensors, using the result of the flow
// algorithm.
Status MinCostFlowAssignment(
const std::vector<TensorUsageRecord>& usage_records,
ObjectsAssignment* assignment) {
const std::vector<TensorUsageRecord<size_t>>& usage_records,
ObjectsAssignment<size_t>* assignment) {
MinCostFlowSolver solver;
solver.Build(usage_records);
solver.Solve();
@ -349,11 +354,11 @@ Status MinCostFlowAssignment(
} // namespace
Status AssignObjectsToTensors(
const std::vector<TensorUsageRecord>& usage_records,
const MemoryStrategy& strategy, ObjectsAssignment* assignment) {
const std::vector<TensorUsageRecord<size_t>>& usage_records,
const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment) {
switch (strategy) {
case MemoryStrategy::NAIVE:
return NaiveAssignment(usage_records, assignment);
return NaiveAssignment<size_t>(usage_records, assignment);
case MemoryStrategy::GREEDY:
return GreedyAssignment(usage_records, assignment);
case MemoryStrategy::MINCOSTFLOW:

View File

@ -28,31 +28,32 @@ namespace gpu {
using TaskId = size_t;
// Record, containing tensor size and IDs of the first and the last task, that
// use this tensor as input or output.
// For example: tensor #3 with size tensor_size=65536 is first introduced in
// program #2 (first_task=2) and used for the last time in program #7
// (last_task=7).
// Record, containing tensor size and IDs of the first and the last task,
// that use this tensor as input or output. For example: tensor #3 with size
// tensor_size=65536 is first introduced in program #2 (first_task=2) and used
// for the last time in program #7 (last_task=7).
template <typename TensorSizeT>
struct TensorUsageRecord {
uint32_t tensor_size;
TensorSizeT tensor_size;
TaskId first_task;
TaskId last_task;
TensorUsageRecord(uint32_t size, TaskId first, TaskId last)
TensorUsageRecord(TensorSizeT size, TaskId first, TaskId last)
: tensor_size(size), first_task(first), last_task(last) {}
// Default order of tensor usage records is increasing order of first_task.
bool operator<(const TensorUsageRecord& other) const {
bool operator<(const TensorUsageRecord<TensorSizeT>& other) const {
return first_task < other.first_task;
}
};
// Information about assignment of tensors to shared objects
template <typename TensorSizeT>
struct ObjectsAssignment {
// shared_object_ids_[i] is ID of shared object, that tensor i will be using.
std::vector<size_t> object_ids;
// shared_object_sizes_[i] is a size of shared object with ID equal to i.
std::vector<uint32_t> object_sizes;
std::vector<TensorSizeT> object_sizes;
};
enum class MemoryStrategy {
@ -71,10 +72,11 @@ enum class MemoryStrategy {
};
// Calculates the assignment of shared objects to given tensors, including
// objects' sizes.
// objects' sizes. Initial tensor sizes are given as size_t. This function is
// intended to use with GPU buffers.
Status AssignObjectsToTensors(
const std::vector<TensorUsageRecord>& usage_records,
const MemoryStrategy& strategy, ObjectsAssignment* assignment);
const std::vector<TensorUsageRecord<size_t>>& usage_records,
const MemoryStrategy& strategy, ObjectsAssignment<size_t>* assignment);
} // namespace gpu
} // namespace tflite

View File

@ -25,7 +25,7 @@ namespace {
using ::testing::ElementsAre;
TEST(Model, EmptyRecords) {
ObjectsAssignment assignment;
ObjectsAssignment<size_t> assignment;
ASSERT_TRUE(
AssignObjectsToTensors({}, MemoryStrategy::NAIVE, &assignment).ok());
EXPECT_TRUE(assignment.object_ids.empty());
@ -42,9 +42,9 @@ TEST(Model, EmptyRecords) {
}
TEST(Model, OneRecord) {
std::vector<TensorUsageRecord> usage_records{
std::vector<TensorUsageRecord<size_t>> usage_records{
{/*size=*/16, /*first=*/0, /*last=*/1}};
ObjectsAssignment assignment;
ObjectsAssignment<size_t> assignment;
ASSERT_TRUE(
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
.ok());
@ -63,14 +63,14 @@ TEST(Model, OneRecord) {
}
TEST(Model, ChainRecords) {
std::vector<TensorUsageRecord> usage_records{
std::vector<TensorUsageRecord<size_t>> usage_records{
{/*size=*/16, /*first=*/0, /*last=*/1},
{/*size=*/8, /*first=*/1, /*last=*/2},
{/*size=*/64, /*first=*/2, /*last=*/3},
{/*size=*/32, /*first=*/3, /*last=*/4},
{/*size=*/8, /*first=*/4, /*last=*/5},
};
ObjectsAssignment assignment;
ObjectsAssignment<size_t> assignment;
ASSERT_TRUE(
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
.ok());
@ -89,7 +89,7 @@ TEST(Model, ChainRecords) {
}
TEST(Model, ComplexRecords) {
std::vector<TensorUsageRecord> usage_records{
std::vector<TensorUsageRecord<size_t>> usage_records{
{/*size=*/32, /*first=*/0, /*last=*/1},
{/*size=*/32, /*first=*/1, /*last=*/4},
{/*size=*/8, /*first=*/2, /*last=*/5},
@ -99,7 +99,7 @@ TEST(Model, ComplexRecords) {
{/*size=*/8, /*first=*/6, /*last=*/8},
{/*size=*/8, /*first=*/7, /*last=*/8},
{/*size=*/16, /*first=*/8, /*last=*/9}};
ObjectsAssignment assignment;
ObjectsAssignment<size_t> assignment;
ASSERT_TRUE(
AssignObjectsToTensors(usage_records, MemoryStrategy::NAIVE, &assignment)
.ok());
@ -111,6 +111,7 @@ TEST(Model, ComplexRecords) {
.ok());
EXPECT_THAT(assignment.object_ids, ElementsAre(0, 1, 0, 2, 3, 1, 3, 2, 0));
EXPECT_THAT(assignment.object_sizes, ElementsAre(32, 64, 16, 8));
ASSERT_TRUE(AssignObjectsToTensors(usage_records, MemoryStrategy::MINCOSTFLOW,
&assignment)
.ok());

View File

@ -88,7 +88,7 @@ using ::tflite::gpu::TensorUsageRecord;
// TODO(ypisarchyk): it makes sense to move this to a separate function
// Generate usage records for each intermediate tensor in order of their first_task
std::vector<TensorUsageRecord> usageRecords;
std::vector<TensorUsageRecord<size_t>> usageRecords;
std::map<ValueId, size_t> usageRecordIds;
for (uint32_t i = 0; i < taskDescriptors.size(); ++i) {
auto outputId = taskDescriptors[i]->output_buffer.id;
@ -111,7 +111,7 @@ using ::tflite::gpu::TensorUsageRecord;
}
}
tflite::gpu::ObjectsAssignment assignment;
tflite::gpu::ObjectsAssignment<size_t> assignment;
RETURN_IF_ERROR(AssignObjectsToTensors(usageRecords, MemoryStrategy::GREEDY, &assignment));
auto objectsCount = assignment.object_sizes.size();
std::vector<id<MTLBuffer>> sharedBuffers(objectsCount);