Add memory stats profiling in BFCAllocator.
PiperOrigin-RevId: 289694152 Change-Id: I67792021ec90e67f5ccd47d391bf11ef36ff23ed
This commit is contained in:
parent
9b58a04025
commit
cb2ca1e812
@ -2779,7 +2779,9 @@ cc_library(
|
||||
":protos_all_cc",
|
||||
":shared_counter",
|
||||
"//tensorflow/core/framework:allocator",
|
||||
"//tensorflow/core/profiler/lib:traceme",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
"@com_google_absl//absl/strings",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||
|
||||
#include <atomic>
|
||||
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||
#include "tensorflow/core/lib/core/bits.h"
|
||||
#include "tensorflow/core/lib/strings/numbers.h"
|
||||
@ -29,6 +30,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/platform/stacktrace.h"
|
||||
#endif
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/profiler/lib/traceme.h"
|
||||
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
|
||||
|
||||
namespace tensorflow {
|
||||
@ -380,6 +382,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
}
|
||||
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||
if (ptr != nullptr) {
|
||||
AddTraceMe("MemoryAllocation");
|
||||
return ptr;
|
||||
}
|
||||
|
||||
@ -387,6 +390,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
if (Extend(unused_alignment, rounded_bytes)) {
|
||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||
if (ptr != nullptr) {
|
||||
AddTraceMe("MemoryAllocation");
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
@ -399,6 +403,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
if (MergeTimestampedChunks(rounded_bytes)) {
|
||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||
if (ptr != nullptr) {
|
||||
AddTraceMe("MemoryAllocation");
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
@ -412,6 +417,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
Extend(unused_alignment, rounded_bytes)) {
|
||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||
if (ptr != nullptr) {
|
||||
AddTraceMe("MemoryAllocation");
|
||||
return ptr;
|
||||
}
|
||||
}
|
||||
@ -435,6 +441,24 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
// Emits a TraceMe event (trace level 2) that carries the allocator's current
// memory statistics.
//
// `traceme_name` is the event name; the statistics are appended to it as
// "#key=value,...#" metadata: allocator_name, bytes_reserved, bytes_allocated,
// bytes_available, fragmentation and peak_bytes_in_use.
//
// The caller must hold lock_ (see the annotation on the declaration); the
// name generator below reads stats_ and walks the chunk lists through
// GetFragmentation().
void BFCAllocator::AddTraceMe(absl::string_view traceme_name) {
  // `trace_me` is a local, so the traced activity ends as soon as this
  // function returns.
  tensorflow::profiler::TraceMe trace_me(
      [&]() EXCLUSIVE_LOCKS_REQUIRED(lock_) {
        // Bind a reference rather than copying the whole AllocatorStats
        // struct: only three fields are read, and nothing invoked here
        // writes to stats_.
        const AllocatorStats& stats = stats_;
        const double fragmentation = GetFragmentation();
        // Bytes of the memory limit that are neither reserved nor in use.
        const int64 bytes_available =
            memory_limit_ - stats.bytes_reserved - stats.bytes_in_use;
        return absl::StrCat(traceme_name, "#allocator_name=", name_,
                            ",bytes_reserved=", stats.bytes_reserved,
                            ",bytes_allocated=", stats.bytes_in_use,
                            ",bytes_available=", bytes_available,
                            ",fragmentation=", fragmentation,
                            ",peak_bytes_in_use=", stats.peak_bytes_in_use,
                            "#");
      },
      /*level=*/2);
}
|
||||
|
||||
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
||||
size_t num_bytes, uint64 freed_before) {
|
||||
// First identify the first bin that could satisfy rounded_bytes.
|
||||
@ -580,6 +604,8 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) {
|
||||
if (VLOG_IS_ON(4)) {
|
||||
LOG(INFO) << "F: " << RenderOccupancy();
|
||||
}
|
||||
|
||||
AddTraceMe("MemoryDeallocation");
|
||||
}
|
||||
|
||||
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
|
||||
@ -1009,8 +1035,6 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
||||
mas->set_bytes_in_use(stats_.bytes_in_use);
|
||||
mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
|
||||
mas->set_largest_alloc_size(stats_.largest_alloc_size);
|
||||
int64 largest_free_chunk = 0;
|
||||
int64 free_bytes = 0;
|
||||
|
||||
// Record summary data for every bin.
|
||||
const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
|
||||
@ -1046,21 +1070,11 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
||||
if (timing_counter_) {
|
||||
mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
|
||||
}
|
||||
if (!c->in_use()) {
|
||||
free_bytes += c->size;
|
||||
if (c->size > largest_free_chunk) {
|
||||
largest_free_chunk = c->size;
|
||||
}
|
||||
}
|
||||
h = c->next;
|
||||
}
|
||||
}
|
||||
double frag_metric = 0.0;
|
||||
if (free_bytes > 0) {
|
||||
frag_metric =
|
||||
(free_bytes - largest_free_chunk) / static_cast<double>(free_bytes);
|
||||
}
|
||||
mas->set_fragmentation_metric(frag_metric);
|
||||
|
||||
mas->set_fragmentation_metric(GetFragmentation());
|
||||
|
||||
#ifdef TENSORFLOW_MEM_DEBUG
|
||||
// Record the recent size history
|
||||
@ -1077,6 +1091,31 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
||||
return md;
|
||||
}
|
||||
|
||||
double BFCAllocator::GetFragmentation() {
|
||||
int64 largest_free_chunk = 0;
|
||||
int64 free_bytes = 0;
|
||||
for (const auto& region : region_manager_.regions()) {
|
||||
ChunkHandle chunk_handle = region_manager_.get_handle(region.ptr());
|
||||
while (chunk_handle != kInvalidChunkHandle) {
|
||||
const Chunk* chunk = ChunkFromHandle(chunk_handle);
|
||||
if (!chunk->in_use()) {
|
||||
free_bytes += chunk->size;
|
||||
if (chunk->size > largest_free_chunk) {
|
||||
largest_free_chunk = chunk->size;
|
||||
}
|
||||
}
|
||||
chunk_handle = chunk->next;
|
||||
}
|
||||
}
|
||||
double frag_metric = 0.0;
|
||||
if (free_bytes > 0) {
|
||||
frag_metric =
|
||||
(free_bytes - largest_free_chunk) / static_cast<double>(free_bytes);
|
||||
}
|
||||
|
||||
return frag_metric;
|
||||
}
|
||||
|
||||
absl::optional<AllocatorStats> BFCAllocator::GetStats() {
|
||||
mutex_lock l(lock_);
|
||||
return stats_;
|
||||
|
@ -115,6 +115,11 @@ class BFCAllocator : public Allocator {
|
||||
bool MergeTimestampedChunks(size_t required_bytes)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Add TraceMe (in memory allocation and deallocation) for memory stats
|
||||
// profiling.
|
||||
void AddTraceMe(absl::string_view traceme_name)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
|
||||
// kInvalidChunkHandle means an invalid chunk
|
||||
typedef size_t ChunkHandle;
|
||||
@ -438,6 +443,10 @@ class BFCAllocator : public Allocator {
|
||||
ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
|
||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Fragmentation is calculated as one minus the ratio of the largest free chunk
|
||||
// size over total free memory, and returns a value within [0, 1].
|
||||
double GetFragmentation() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||
|
||||
// Information about a Bin that is useful for debugging.
|
||||
struct BinDebugInfo {
|
||||
size_t total_bytes_in_use = 0;
|
||||
|
@ -40,6 +40,8 @@ static const absl::string_view kHostEventTypeMetadataMap[] = {
|
||||
"EagerKernelExecute",
|
||||
"ExecutorState::Process",
|
||||
"ExecutorDoneCallback",
|
||||
"MemoryAllocation",
|
||||
"MemoryDeallocation",
|
||||
// tf data captured function events.
|
||||
"InstantiatedCapturedFunction::Run",
|
||||
"InstantiatedCapturedFunction::RunWithBorrowedArgs",
|
||||
@ -81,10 +83,12 @@ static const absl::string_view kStatTypeStrMap[] = {
|
||||
"step_num",
|
||||
"iter_num",
|
||||
"index_on_host",
|
||||
"allocator_name",
|
||||
"bytes_reserved",
|
||||
"bytes_allocated",
|
||||
"bytes_available",
|
||||
"fragmentation",
|
||||
"peak_bytes_in_use",
|
||||
"device_id",
|
||||
"context_id",
|
||||
"correlation_id",
|
||||
@ -136,10 +140,12 @@ const absl::flat_hash_map<absl::string_view, StatType>& GetStatTypeMap() {
|
||||
{"step_num", kStepNum},
|
||||
{"iter_num", kIterNum},
|
||||
{"index_on_host", kIndexOnHost},
|
||||
{"allocator_name", kAllocatorName},
|
||||
{"bytes_reserved", kBytesReserved},
|
||||
{"bytes_allocated", kBytesAllocated},
|
||||
{"bytes_available", kBytesAvailable},
|
||||
{"fragmentation", kFragmentation},
|
||||
{"peak_bytes_in_use", kPeakBytesInUse},
|
||||
// Device trace arguments.
|
||||
{"device_id", kDeviceId},
|
||||
{"context_id", kContextId},
|
||||
|
@ -40,6 +40,8 @@ enum HostEventType {
|
||||
kEagerKernelExecute,
|
||||
kExecutorStateProcess,
|
||||
kExecutorDoneCallback,
|
||||
kMemoryAllocation,
|
||||
kMemoryDeallocation,
|
||||
// tf.data captured function events.
|
||||
kTfDataCapturedFunctionRun,
|
||||
kTfDataCapturedFunctionRunWithBorrowedArgs,
|
||||
@ -80,10 +82,12 @@ enum StatType {
|
||||
kStepNum,
|
||||
kIterNum,
|
||||
kIndexOnHost,
|
||||
kAllocatorName,
|
||||
kBytesReserved,
|
||||
kBytesAllocated,
|
||||
kBytesAvailable,
|
||||
kFragmentation,
|
||||
kPeakBytesInUse,
|
||||
// Device trace arguments.
|
||||
kDeviceId,
|
||||
kContextId,
|
||||
|
Loading…
Reference in New Issue
Block a user