diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 334a87794b0..1c9bddd1dbc 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -2779,7 +2779,9 @@ cc_library( ":protos_all_cc", ":shared_counter", "//tensorflow/core/framework:allocator", + "//tensorflow/core/profiler/lib:traceme", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/core/common_runtime/bfc_allocator.cc b/tensorflow/core/common_runtime/bfc_allocator.cc index c43e72c7914..9e3bcd81ae4 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.cc +++ b/tensorflow/core/common_runtime/bfc_allocator.cc @@ -17,6 +17,7 @@ limitations under the License. #include +#include "absl/strings/string_view.h" #include "tensorflow/core/common_runtime/allocator_retry.h" #include "tensorflow/core/lib/core/bits.h" #include "tensorflow/core/lib/strings/numbers.h" @@ -29,6 +30,7 @@ limitations under the License. #include "tensorflow/core/platform/stacktrace.h" #endif #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/lib/traceme.h" #include "tensorflow/core/protobuf/bfc_memory_map.pb.h" namespace tensorflow { @@ -380,6 +382,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, } void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); if (ptr != nullptr) { + AddTraceMe("MemoryAllocation"); return ptr; } @@ -387,6 +390,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, if (Extend(unused_alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); if (ptr != nullptr) { + AddTraceMe("MemoryAllocation"); return ptr; } } @@ -399,6 +403,7 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, if (MergeTimestampedChunks(rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); if (ptr != nullptr) { + AddTraceMe("MemoryAllocation"); return ptr; } } @@ -412,6 +417,7 @@ void* 
BFCAllocator::AllocateRawInternal(size_t unused_alignment, Extend(unused_alignment, rounded_bytes)) { ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); if (ptr != nullptr) { + AddTraceMe("MemoryAllocation"); return ptr; } } @@ -435,6 +441,24 @@ void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, return nullptr; } +void BFCAllocator::AddTraceMe(absl::string_view traceme_name) { + tensorflow::profiler::TraceMe trace_me( + [&]() EXCLUSIVE_LOCKS_REQUIRED(lock_) { + AllocatorStats stats = stats_; + double fragmentation = GetFragmentation(); + int64 bytes_available = + memory_limit_ - stats.bytes_reserved - stats.bytes_in_use; + return absl::StrCat(traceme_name, "#allocator_name=", name_, + ",bytes_reserved=", stats.bytes_reserved, + ",bytes_allocated=", stats.bytes_in_use, + ",bytes_available=", bytes_available, + ",fragmentation=", fragmentation, + ",peak_bytes_in_use=", stats.peak_bytes_in_use, + "#"); + }, + /*level=*/2); +} + void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes, uint64 freed_before) { // First identify the first bin that could satisfy rounded_bytes. @@ -580,6 +604,8 @@ void BFCAllocator::DeallocateRawInternal(void* ptr) { if (VLOG_IS_ON(4)) { LOG(INFO) << "F: " << RenderOccupancy(); } + + AddTraceMe("MemoryDeallocation"); } // Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is c1. @@ -1009,8 +1035,6 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { mas->set_bytes_in_use(stats_.bytes_in_use); mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use); mas->set_largest_alloc_size(stats_.largest_alloc_size); - int64 largest_free_chunk = 0; - int64 free_bytes = 0; // Record summary data for every bin. const std::array bin_infos = get_bin_debug_info(); @@ -1046,21 +1070,11 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { if (timing_counter_) { mc->set_freed_at_count(c->in_use() ? 
0 : c->freed_at_count); } - if (!c->in_use()) { - free_bytes += c->size; - if (c->size > largest_free_chunk) { - largest_free_chunk = c->size; - } - } h = c->next; } } - double frag_metric = 0.0; - if (free_bytes > 0) { - frag_metric = - (free_bytes - largest_free_chunk) / static_cast<double>(free_bytes); - } - mas->set_fragmentation_metric(frag_metric); + + mas->set_fragmentation_metric(GetFragmentation()); #ifdef TENSORFLOW_MEM_DEBUG // Record the recent size history @@ -1077,6 +1091,31 @@ MemoryDump BFCAllocator::RecordMemoryMapInternal() { return md; } +double BFCAllocator::GetFragmentation() { + int64 largest_free_chunk = 0; + int64 free_bytes = 0; + for (const auto& region : region_manager_.regions()) { + ChunkHandle chunk_handle = region_manager_.get_handle(region.ptr()); + while (chunk_handle != kInvalidChunkHandle) { + const Chunk* chunk = ChunkFromHandle(chunk_handle); + if (!chunk->in_use()) { + free_bytes += chunk->size; + if (chunk->size > largest_free_chunk) { + largest_free_chunk = chunk->size; + } + } + chunk_handle = chunk->next; + } + } + double frag_metric = 0.0; + if (free_bytes > 0) { + frag_metric = + (free_bytes - largest_free_chunk) / static_cast<double>(free_bytes); + } + + return frag_metric; +} + absl::optional<AllocatorStats> BFCAllocator::GetStats() { mutex_lock l(lock_); return stats_; diff --git a/tensorflow/core/common_runtime/bfc_allocator.h b/tensorflow/core/common_runtime/bfc_allocator.h index 209eb0eed54..2dd7125f5c6 100644 --- a/tensorflow/core/common_runtime/bfc_allocator.h +++ b/tensorflow/core/common_runtime/bfc_allocator.h @@ -115,6 +115,11 @@ class BFCAllocator : public Allocator { bool MergeTimestampedChunks(size_t required_bytes) EXCLUSIVE_LOCKS_REQUIRED(lock_); + // Add TraceMe (in memory allocation and deallocation) for memory stats + // profiling. 
+ void AddTraceMe(absl::string_view traceme_name) + EXCLUSIVE_LOCKS_REQUIRED(lock_); + // A ChunkHandle is an index into the chunks_ vector in BFCAllocator // kInvalidChunkHandle means an invalid chunk typedef size_t ChunkHandle; @@ -438,6 +443,10 @@ class BFCAllocator : public Allocator { ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at) EXCLUSIVE_LOCKS_REQUIRED(lock_); + // Fragmentation is one minus the ratio of the largest free chunk size to the + // total free memory (0 when nothing is free); the result is within [0, 1]. + double GetFragmentation() EXCLUSIVE_LOCKS_REQUIRED(lock_); + // Information about a Bin that is useful for debugging. struct BinDebugInfo { size_t total_bytes_in_use = 0; diff --git a/tensorflow/core/profiler/utils/xplane_schema.cc b/tensorflow/core/profiler/utils/xplane_schema.cc index 9a9cefe3536..39e14ef2a28 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.cc +++ b/tensorflow/core/profiler/utils/xplane_schema.cc @@ -40,6 +40,8 @@ static const absl::string_view kHostEventTypeMetadataMap[] = { "EagerKernelExecute", "ExecutorState::Process", "ExecutorDoneCallback", + "MemoryAllocation", + "MemoryDeallocation", // tf data captured function events. "InstantiatedCapturedFunction::Run", "InstantiatedCapturedFunction::RunWithBorrowedArgs", @@ -81,10 +83,12 @@ static const absl::string_view kStatTypeStrMap[] = { "step_num", "iter_num", "index_on_host", + "allocator_name", "bytes_reserved", "bytes_allocated", "bytes_available", "fragmentation", + "peak_bytes_in_use", "device_id", "context_id", "correlation_id", @@ -136,10 +140,12 @@ const absl::flat_hash_map& GetStatTypeMap() { {"step_num", kStepNum}, {"iter_num", kIterNum}, {"index_on_host", kIndexOnHost}, + {"allocator_name", kAllocatorName}, {"bytes_reserved", kBytesReserved}, {"bytes_allocated", kBytesAllocated}, {"bytes_available", kBytesAvailable}, {"fragmentation", kFragmentation}, + {"peak_bytes_in_use", kPeakBytesInUse}, // Device trace arguments. 
{"device_id", kDeviceId}, {"context_id", kContextId}, diff --git a/tensorflow/core/profiler/utils/xplane_schema.h b/tensorflow/core/profiler/utils/xplane_schema.h index 842123bc771..743fedf33aa 100644 --- a/tensorflow/core/profiler/utils/xplane_schema.h +++ b/tensorflow/core/profiler/utils/xplane_schema.h @@ -40,6 +40,8 @@ enum HostEventType { kEagerKernelExecute, kExecutorStateProcess, kExecutorDoneCallback, + kMemoryAllocation, + kMemoryDeallocation, // tf.data captured function events. kTfDataCapturedFunctionRun, kTfDataCapturedFunctionRunWithBorrowedArgs, @@ -80,10 +82,12 @@ enum StatType { kStepNum, kIterNum, kIndexOnHost, + kAllocatorName, kBytesReserved, kBytesAllocated, kBytesAvailable, kFragmentation, + kPeakBytesInUse, // Device trace arguments. kDeviceId, kContextId,