Add memory stats profiling in BFCAllocator.
PiperOrigin-RevId: 289694152 Change-Id: I67792021ec90e67f5ccd47d391bf11ef36ff23ed
This commit is contained in:
parent
9b58a04025
commit
cb2ca1e812
@ -2779,7 +2779,9 @@ cc_library(
|
|||||||
":protos_all_cc",
|
":protos_all_cc",
|
||||||
":shared_counter",
|
":shared_counter",
|
||||||
"//tensorflow/core/framework:allocator",
|
"//tensorflow/core/framework:allocator",
|
||||||
|
"//tensorflow/core/profiler/lib:traceme",
|
||||||
"@com_google_absl//absl/container:flat_hash_set",
|
"@com_google_absl//absl/container:flat_hash_set",
|
||||||
|
"@com_google_absl//absl/strings",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
|||||||
|
|
||||||
#include <atomic>
|
#include <atomic>
|
||||||
|
|
||||||
|
#include "absl/strings/string_view.h"
|
||||||
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
#include "tensorflow/core/common_runtime/allocator_retry.h"
|
||||||
#include "tensorflow/core/lib/core/bits.h"
|
#include "tensorflow/core/lib/core/bits.h"
|
||||||
#include "tensorflow/core/lib/strings/numbers.h"
|
#include "tensorflow/core/lib/strings/numbers.h"
|
||||||
@ -29,6 +30,7 @@ limitations under the License.
|
|||||||
#include "tensorflow/core/platform/stacktrace.h"
|
#include "tensorflow/core/platform/stacktrace.h"
|
||||||
#endif
|
#endif
|
||||||
#include "tensorflow/core/platform/types.h"
|
#include "tensorflow/core/platform/types.h"
|
||||||
|
#include "tensorflow/core/profiler/lib/traceme.h"
|
||||||
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
|
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"
|
||||||
|
|
||||||
namespace tensorflow {
|
namespace tensorflow {
|
||||||
@ -380,6 +382,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
|||||||
}
|
}
|
||||||
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||||
if (ptr != nullptr) {
|
if (ptr != nullptr) {
|
||||||
|
AddTraceMe("MemoryAllocation");
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -387,6 +390,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
|||||||
if (Extend(unused_alignment, rounded_bytes)) {
|
if (Extend(unused_alignment, rounded_bytes)) {
|
||||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||||
if (ptr != nullptr) {
|
if (ptr != nullptr) {
|
||||||
|
AddTraceMe("MemoryAllocation");
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -399,6 +403,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
|||||||
if (MergeTimestampedChunks(rounded_bytes)) {
|
if (MergeTimestampedChunks(rounded_bytes)) {
|
||||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||||
if (ptr != nullptr) {
|
if (ptr != nullptr) {
|
||||||
|
AddTraceMe("MemoryAllocation");
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -412,6 +417,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
|||||||
Extend(unused_alignment, rounded_bytes)) {
|
Extend(unused_alignment, rounded_bytes)) {
|
||||||
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
|
||||||
if (ptr != nullptr) {
|
if (ptr != nullptr) {
|
||||||
|
AddTraceMe("MemoryAllocation");
|
||||||
return ptr;
|
return ptr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -435,6 +441,24 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
|
|||||||
return nullptr;
|
return nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Records a profiler TraceMe event named `traceme_name`, annotated with the
// allocator's current memory statistics (reserved / in-use / available bytes,
// fragmentation and peak usage). The annotation string is produced by the
// lambda handed to TraceMe; the TraceMe object itself goes out of scope as
// soon as this function returns. Must be called with lock_ held.
void BFCAllocator::AddTraceMe(absl::string_view traceme_name) {
  tensorflow::profiler::TraceMe trace_me(
      [&]() EXCLUSIVE_LOCKS_REQUIRED(lock_) {
        // lock_ is held by the caller, so stats_ can be read in place.
        const AllocatorStats& current = stats_;
        // Memory still available under the limit after subtracting what is
        // reserved and what is currently in use.
        int64 available_bytes =
            memory_limit_ - current.bytes_reserved - current.bytes_in_use;
        double frag = GetFragmentation();
        // Encoded as "<name>#key=value,key=value,...#".
        return absl::StrCat(traceme_name, "#allocator_name=", name_,
                            ",bytes_reserved=", current.bytes_reserved,
                            ",bytes_allocated=", current.bytes_in_use,
                            ",bytes_available=", available_bytes,
                            ",fragmentation=", frag,
                            ",peak_bytes_in_use=", current.peak_bytes_in_use,
                            "#");
      },
      /*level=*/2);
}
|
||||||
|
|
||||||
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
|
||||||
size_t num_bytes, uint64 freed_before) {
|
size_t num_bytes, uint64 freed_before) {
|
||||||
// First identify the first bin that could satisfy rounded_bytes.
|
// First identify the first bin that could satisfy rounded_bytes.
|
||||||
@ -580,6 +604,8 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) {
|
|||||||
if (VLOG_IS_ON(4)) {
|
if (VLOG_IS_ON(4)) {
|
||||||
LOG(INFO) << "F: " << RenderOccupancy();
|
LOG(INFO) << "F: " << RenderOccupancy();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
AddTraceMe("MemoryDeallocation");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
|
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1.
|
||||||
@ -1009,8 +1035,6 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
|||||||
mas->set_bytes_in_use(stats_.bytes_in_use);
|
mas->set_bytes_in_use(stats_.bytes_in_use);
|
||||||
mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
|
mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
|
||||||
mas->set_largest_alloc_size(stats_.largest_alloc_size);
|
mas->set_largest_alloc_size(stats_.largest_alloc_size);
|
||||||
int64 largest_free_chunk = 0;
|
|
||||||
int64 free_bytes = 0;
|
|
||||||
|
|
||||||
// Record summary data for every bin.
|
// Record summary data for every bin.
|
||||||
const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
|
const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
|
||||||
@ -1046,21 +1070,11 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
|||||||
if (timing_counter_) {
|
if (timing_counter_) {
|
||||||
mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
|
mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
|
||||||
}
|
}
|
||||||
if (!c->in_use()) {
|
|
||||||
free_bytes += c->size;
|
|
||||||
if (c->size > largest_free_chunk) {
|
|
||||||
largest_free_chunk = c->size;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
h = c->next;
|
h = c->next;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
double frag_metric = 0.0;
|
|
||||||
if (free_bytes > 0) {
|
mas->set_fragmentation_metric(GetFragmentation());
|
||||||
frag_metric =
|
|
||||||
(free_bytes - largest_free_chunk) / static_cast<double>(free_bytes);
|
|
||||||
}
|
|
||||||
mas->set_fragmentation_metric(frag_metric);
|
|
||||||
|
|
||||||
#ifdef TENSORFLOW_MEM_DEBUG
|
#ifdef TENSORFLOW_MEM_DEBUG
|
||||||
// Record the recent size history
|
// Record the recent size history
|
||||||
@ -1077,6 +1091,31 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() {
|
|||||||
return md;
|
return md;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
double BFCAllocator::GetFragmentation() {
|
||||||
|
int64 largest_free_chunk = 0;
|
||||||
|
int64 free_bytes = 0;
|
||||||
|
for (const auto& region : region_manager_.regions()) {
|
||||||
|
ChunkHandle chunk_handle = region_manager_.get_handle(region.ptr());
|
||||||
|
while (chunk_handle != kInvalidChunkHandle) {
|
||||||
|
const Chunk* chunk = ChunkFromHandle(chunk_handle);
|
||||||
|
if (!chunk->in_use()) {
|
||||||
|
free_bytes += chunk->size;
|
||||||
|
if (chunk->size > largest_free_chunk) {
|
||||||
|
largest_free_chunk = chunk->size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
chunk_handle = chunk->next;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
double frag_metric = 0.0;
|
||||||
|
if (free_bytes > 0) {
|
||||||
|
frag_metric =
|
||||||
|
(free_bytes - largest_free_chunk) / static_cast<double>(free_bytes);
|
||||||
|
}
|
||||||
|
|
||||||
|
return frag_metric;
|
||||||
|
}
|
||||||
|
|
||||||
absl::optional<AllocatorStats> BFCAllocator::GetStats() {
|
absl::optional<AllocatorStats> BFCAllocator::GetStats() {
|
||||||
mutex_lock l(lock_);
|
mutex_lock l(lock_);
|
||||||
return stats_;
|
return stats_;
|
||||||
|
@ -115,6 +115,11 @@ class BFCAllocator : public Allocator {
|
|||||||
bool MergeTimestampedChunks(size_t required_bytes)
|
bool MergeTimestampedChunks(size_t required_bytes)
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||||
|
|
||||||
|
// Add TraceMe (in memory allocation and deallocation) for memory stats
|
||||||
|
// profiling.
|
||||||
|
void AddTraceMe(absl::string_view traceme_name)
|
||||||
|
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||||
|
|
||||||
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
|
// A ChunkHandle is an index into the chunks_ vector in BFCAllocator
|
||||||
// kInvalidChunkHandle means an invalid chunk
|
// kInvalidChunkHandle means an invalid chunk
|
||||||
typedef size_t ChunkHandle;
|
typedef size_t ChunkHandle;
|
||||||
@ -438,6 +443,10 @@ class BFCAllocator : public Allocator {
|
|||||||
ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
|
ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
|
||||||
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||||
|
|
||||||
|
// Fragmentation is computed as 1 - (largest free chunk size / total free
|
||||||
|
// memory), giving a value within [0, 1]; it is 0 when no memory is free.
|
||||||
|
double GetFragmentation() EXCLUSIVE_LOCKS_REQUIRED(lock_);
|
||||||
|
|
||||||
// Information about a Bin that is useful for debugging.
|
// Information about a Bin that is useful for debugging.
|
||||||
struct BinDebugInfo {
|
struct BinDebugInfo {
|
||||||
size_t total_bytes_in_use = 0;
|
size_t total_bytes_in_use = 0;
|
||||||
|
@ -40,6 +40,8 @@ static const absl::string_view kHostEventTypeMetadataMap[] = {
|
|||||||
"EagerKernelExecute",
|
"EagerKernelExecute",
|
||||||
"ExecutorState::Process",
|
"ExecutorState::Process",
|
||||||
"ExecutorDoneCallback",
|
"ExecutorDoneCallback",
|
||||||
|
"MemoryAllocation",
|
||||||
|
"MemoryDeallocation",
|
||||||
// tf data captured function events.
|
// tf data captured function events.
|
||||||
"InstantiatedCapturedFunction::Run",
|
"InstantiatedCapturedFunction::Run",
|
||||||
"InstantiatedCapturedFunction::RunWithBorrowedArgs",
|
"InstantiatedCapturedFunction::RunWithBorrowedArgs",
|
||||||
@ -81,10 +83,12 @@ static const absl::string_view kStatTypeStrMap[] = {
|
|||||||
"step_num",
|
"step_num",
|
||||||
"iter_num",
|
"iter_num",
|
||||||
"index_on_host",
|
"index_on_host",
|
||||||
|
"allocator_name",
|
||||||
"bytes_reserved",
|
"bytes_reserved",
|
||||||
"bytes_allocated",
|
"bytes_allocated",
|
||||||
"bytes_available",
|
"bytes_available",
|
||||||
"fragmentation",
|
"fragmentation",
|
||||||
|
"peak_bytes_in_use",
|
||||||
"device_id",
|
"device_id",
|
||||||
"context_id",
|
"context_id",
|
||||||
"correlation_id",
|
"correlation_id",
|
||||||
@ -136,10 +140,12 @@ const absl::flat_hash_map<absl::string_view, StatType>& GetStatTypeMap() {
|
|||||||
{"step_num", kStepNum},
|
{"step_num", kStepNum},
|
||||||
{"iter_num", kIterNum},
|
{"iter_num", kIterNum},
|
||||||
{"index_on_host", kIndexOnHost},
|
{"index_on_host", kIndexOnHost},
|
||||||
|
{"allocator_name", kAllocatorName},
|
||||||
{"bytes_reserved", kBytesReserved},
|
{"bytes_reserved", kBytesReserved},
|
||||||
{"bytes_allocated", kBytesAllocated},
|
{"bytes_allocated", kBytesAllocated},
|
||||||
{"bytes_available", kBytesAvailable},
|
{"bytes_available", kBytesAvailable},
|
||||||
{"fragmentation", kFragmentation},
|
{"fragmentation", kFragmentation},
|
||||||
|
{"peak_bytes_in_use", kPeakBytesInUse},
|
||||||
// Device trace arguments.
|
// Device trace arguments.
|
||||||
{"device_id", kDeviceId},
|
{"device_id", kDeviceId},
|
||||||
{"context_id", kContextId},
|
{"context_id", kContextId},
|
||||||
|
@ -40,6 +40,8 @@ enum HostEventType {
|
|||||||
kEagerKernelExecute,
|
kEagerKernelExecute,
|
||||||
kExecutorStateProcess,
|
kExecutorStateProcess,
|
||||||
kExecutorDoneCallback,
|
kExecutorDoneCallback,
|
||||||
|
kMemoryAllocation,
|
||||||
|
kMemoryDeallocation,
|
||||||
// tf.data captured function events.
|
// tf.data captured function events.
|
||||||
kTfDataCapturedFunctionRun,
|
kTfDataCapturedFunctionRun,
|
||||||
kTfDataCapturedFunctionRunWithBorrowedArgs,
|
kTfDataCapturedFunctionRunWithBorrowedArgs,
|
||||||
@ -80,10 +82,12 @@ enum StatType {
|
|||||||
kStepNum,
|
kStepNum,
|
||||||
kIterNum,
|
kIterNum,
|
||||||
kIndexOnHost,
|
kIndexOnHost,
|
||||||
|
kAllocatorName,
|
||||||
kBytesReserved,
|
kBytesReserved,
|
||||||
kBytesAllocated,
|
kBytesAllocated,
|
||||||
kBytesAvailable,
|
kBytesAvailable,
|
||||||
kFragmentation,
|
kFragmentation,
|
||||||
|
kPeakBytesInUse,
|
||||||
// Device trace arguments.
|
// Device trace arguments.
|
||||||
kDeviceId,
|
kDeviceId,
|
||||||
kContextId,
|
kContextId,
|
||||||
|
Loading…
Reference in New Issue
Block a user