Create ConvertXPlaneToMemoryProfile function for OSS TF memory profiler.
PiperOrigin-RevId: 309666664
Change-Id: If89abf5cf2dca667e7c712ecde0d086e793cfc1b
parent 078aa26e41
commit 308bc07737
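
For orientation, a minimal sketch of how the new entry point is consumed. GetHostMemoryProfile is a hypothetical wrapper, not part of the commit; FindPlaneWithName and kHostThreads come from the profiler utils, mirroring the xplane_to_profile_response change further below.

#include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {

// Hypothetical helper: locate the host-threads plane in a captured XSpace and
// convert its memory allocation/deallocation events into a MemoryProfile.
// Returns an empty profile when no host trace was captured.
MemoryProfile GetHostMemoryProfile(const XSpace& xspace) {
  const XPlane* host_plane = FindPlaneWithName(xspace, kHostThreads);
  if (host_plane == nullptr) return MemoryProfile();
  return ConvertXPlaneToMemoryProfile(*host_plane);
}

}  // namespace profiler
}  // namespace tensorflow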
@@ -256,6 +256,7 @@ cc_library(
        ":op_stats_to_overview_page",
        ":op_stats_to_tf_stats",
        ":trace_events_to_json",
        ":xplane_to_memory_profile",
        ":xplane_to_op_stats",
        ":xplane_to_trace_events",
        "//tensorflow/core:lib",
@@ -263,11 +264,14 @@ cc_library(
        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
        "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc",
        "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
        "//tensorflow/core/profiler/protobuf:memory_profile_proto_cc",
        "//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
        "//tensorflow/core/profiler/protobuf:overview_page_proto_cc",
        "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc",
        "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
        "//tensorflow/core/profiler/rpc/client:save_profile",
        "//tensorflow/core/profiler/utils:xplane_schema",
        "//tensorflow/core/profiler/utils:xplane_utils",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/strings",
    ],
@@ -414,3 +418,37 @@ tf_cc_test(
        "//tensorflow/core/profiler/utils:xplane_utils",
    ],
)

cc_library(
    name = "xplane_to_memory_profile",
    srcs = ["xplane_to_memory_profile.cc"],
    hdrs = ["xplane_to_memory_profile.h"],
    deps = [
        "//tensorflow/core:framework",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core/platform:protobuf",
        "//tensorflow/core/profiler/protobuf:memory_profile_proto_cc",
        "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
        "//tensorflow/core/profiler/utils:tf_xplane_visitor",
        "//tensorflow/core/profiler/utils:xplane_schema",
        "//tensorflow/core/profiler/utils:xplane_utils",
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
    ],
)

tf_cc_test(
    name = "xplane_to_memory_profile_test",
    size = "small",
    srcs = ["xplane_to_memory_profile_test.cc"],
    deps = [
        ":xplane_to_memory_profile",
        "//tensorflow/core:test",
        "//tensorflow/core:test_main",
        "//tensorflow/core/profiler/utils:xplane_builder",
        "//tensorflow/core/profiler/utils:xplane_schema",
        "//tensorflow/core/profiler/utils:xplane_utils",
    ],
)
tensorflow/core/profiler/convert/xplane_to_memory_profile.cc (new file, 467 lines)
@@ -0,0 +1,467 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h"

#include <algorithm>
#include <cstddef>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

#include "absl/algorithm/container.h"
#include "absl/container/flat_hash_map.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/profiler/protobuf/memory_profile.pb.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {

namespace {

// Index of the time-sorted memory_profile_snapshots list, and the
// MemoryActivityMetadata proto it contains.
using IndexMetaPair = std::pair<int64 /*index*/, const MemoryActivityMetadata*>;

// Aggregated memory stats from an allocator. Temporary container to fill
// MemoryAggregationStats.
struct AggregationStats {
  int64 bytes_reserved = 0;
  int64 bytes_allocated = 0;
  int64 bytes_available = 0;
  double fragmentation = 0;
  int64 peak_bytes_in_use = 0;
};

// Metadata associated with each memory allocation/deallocation activity.
// Temporary container to fill MemoryActivityMetadata.
struct ActivityMetadata {
  int64 requested_bytes = 0;
  int64 allocation_bytes = 0;
  uint64 address = 0;
  absl::string_view tf_op_name;
  int64 step_id = -1;
  absl::string_view region_type;
  int64 data_type = 0;
  absl::string_view tensor_shape;
};

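// Note: the string_view fields above alias data owned by the XPlane, so these
// temporary structs must not outlive the host trace; FillActivityMetadata
// below copies them into owned std::string proto fields right away.
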
bool IsMemoryAllocation(int64 event_type) {
  return event_type == HostEventType::kMemoryAllocation;
}

bool IsMemoryDeallocation(int64 event_type) {
  return event_type == HostEventType::kMemoryDeallocation;
}

void FillAggregationStats(const AggregationStats& src,
                          MemoryAggregationStats* dst) {
  dst->set_stack_reserved_bytes(src.bytes_reserved);
  dst->set_heap_allocated_bytes(src.bytes_allocated);
  dst->set_free_memory_bytes(src.bytes_available);
  dst->set_fragmentation(src.fragmentation);
  dst->set_peak_bytes_in_use(src.peak_bytes_in_use);
}

void FillActivityMetadata(int64 event_type, const ActivityMetadata& src,
                          MemoryActivityMetadata* dst) {
  if (IsMemoryAllocation(event_type)) {
    dst->set_memory_activity(ALLOCATION);
  } else if (IsMemoryDeallocation(event_type)) {
    dst->set_memory_activity(DEALLOCATION);
  }
  dst->set_requested_bytes(src.requested_bytes);
  dst->set_allocation_bytes(src.allocation_bytes);
  dst->set_address(src.address);
  dst->set_tf_op_name(std::string(src.tf_op_name));
  dst->set_step_id(src.step_id);
  dst->set_region_type(std::string(src.region_type));
  dst->set_data_type(tensorflow::DataTypeString(
      static_cast<tensorflow::DataType>(src.data_type)));
  dst->set_tensor_shape(std::string(src.tensor_shape));
}

void UpdateProfileSummary(const AggregationStats& stats, int64 time_offset_ps,
                          MemoryProfileSummary* summary) {
  // Update the peak memory usage over allocator's lifetime.
  summary->set_peak_bytes_usage_lifetime(stats.peak_bytes_in_use);
  MemoryAggregationStats* peak_stats = summary->mutable_peak_stats();
  // If we reach (or stay at) peak memory usage within the profiling window,
  // update memory profile summary.
  if (stats.bytes_reserved + stats.bytes_allocated >=
      peak_stats->peak_bytes_in_use()) {
    peak_stats->set_peak_bytes_in_use(stats.bytes_reserved +
                                      stats.bytes_allocated);
    peak_stats->set_stack_reserved_bytes(stats.bytes_reserved);
    peak_stats->set_heap_allocated_bytes(stats.bytes_allocated);
    peak_stats->set_free_memory_bytes(stats.bytes_available);
    peak_stats->set_fragmentation(stats.fragmentation);
    summary->set_peak_stats_time_ps(time_offset_ps);
    summary->set_memory_capacity(stats.bytes_reserved + stats.bytes_allocated +
                                 stats.bytes_available);
  }
}

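// Note on UpdateProfileSummary: the ">=" comparison means that if the same
// peak (stack + heap) total recurs within the profiling window, the latest
// occurrence wins. With the test data below, the per-snapshot totals are
// 5000, 4744, and 7000 bytes, so peak_stats_time_ps lands on the third
// event's offset (70000 ps).
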
// Generate memory profile proto by processing host trace XPlane.
MemoryProfile GenerateMemoryProfile(const XPlane* host_trace) {
  XPlaneVisitor plane = CreateTfXPlaneVisitor(host_trace);
  MemoryProfile memory_profile;
  auto* step_count = memory_profile.mutable_step_count();
  // Iterate over all XEvents in the XPlane, and add the XStats to a new
  // MemoryProfileSnapshot if the EventType is kMemoryAllocation or
  // kMemoryDeallocation.
  plane.ForEachLine([&](const XLineVisitor& line) {
    line.ForEachEvent([&](const XEventVisitor& event) {
      int64 event_type = event.Type().value_or(kUnknownHostEventType);
      if (!(IsMemoryAllocation(event_type) ||
            IsMemoryDeallocation(event_type))) {
        return;
      }

      AggregationStats stats;
      ActivityMetadata metadata;
      std::string memory_id;
      event.ForEachStat([&](const XStatVisitor& stat) {
        if (stat.Type() == StatType::kIndexOnHost ||
            stat.Type() == StatType::kDeviceOrdinal) {
          memory_id = absl::StrFormat("%d", stat.IntValue());
        } else if (stat.Type() == StatType::kAllocatorName) {
          memory_id = stat.ToString();
        } else if (stat.Type() == StatType::kBytesReserved) {
          stats.bytes_reserved = stat.IntValue();
        } else if (stat.Type() == StatType::kBytesAllocated) {
          stats.bytes_allocated = stat.IntValue();
        } else if (stat.Type() == StatType::kBytesAvailable) {
          stats.bytes_available = stat.IntValue();
        } else if (stat.Type() == StatType::kFragmentation) {
          stats.fragmentation = stat.DoubleValue();
        } else if (stat.Type() == StatType::kPeakBytesInUse) {
          stats.peak_bytes_in_use = stat.IntValue();
        } else if (stat.Type() == StatType::kRequestedBytes) {
          metadata.requested_bytes = stat.IntValue();
        } else if (stat.Type() == StatType::kAllocationBytes) {
          metadata.allocation_bytes = stat.IntValue();
        } else if (stat.Type() == StatType::kAddress) {
          metadata.address = stat.IntValue();
        } else if (stat.Type() == StatType::kTfOp) {
          metadata.tf_op_name = stat.StrValue();
        } else if (stat.Type() == StatType::kStepId) {
          metadata.step_id = stat.IntValue();
          if (metadata.step_id != 0) (*step_count)[metadata.step_id]++;
        } else if (stat.Type() == StatType::kRegionType) {
          metadata.region_type = stat.StrValue();
        } else if (stat.Type() == StatType::kDataType) {
          metadata.data_type = stat.IntValue();
        } else if (stat.Type() == StatType::kTensorShapes) {
          metadata.tensor_shape = stat.StrValue();
        }
      });

      MemoryProfileSnapshot* snapshot =
          (*memory_profile.mutable_memory_profile_per_allocator())[memory_id]
              .add_memory_profile_snapshots();
      snapshot->set_time_offset_ps(event.OffsetPs());
      FillAggregationStats(stats, snapshot->mutable_aggregation_stats());
      FillActivityMetadata(event_type, metadata,
                           snapshot->mutable_activity_metadata());

      MemoryProfileSummary* summary =
          (*memory_profile.mutable_memory_profile_per_allocator())[memory_id]
              .mutable_profile_summary();
      UpdateProfileSummary(stats, event.OffsetPs(), summary);
    });
  });
  return memory_profile;
}

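// Note on GenerateMemoryProfile: each qualifying event yields exactly one
// MemoryProfileSnapshot, keyed by memory_id (the allocator name when
// kAllocatorName is present, otherwise the host index / device ordinal).
// Snapshots are appended in trace order here and sorted by time offset later
// in ProcessMemoryProfileProto.
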
// Sequentialize step ids for the memory profile.
void UpdateStepId(const tensorflow::protobuf::Map<
                      tensorflow::protobuf_int64 /*orig_step_id*/,
                      tensorflow::protobuf_int64 /*count*/>& step_count,
                  PerAllocatorMemoryProfile* memory_profile) {
  // Map from original random step id to sequential step id.
  absl::flat_hash_map<int64 /*orig_step_id*/, int64 /*step_id*/> step_map;
  constexpr int kUnknownStep = -2;
  constexpr double kStepFilterRatio = 0.1;  // Magic number for filtering.
  tensorflow::protobuf_int64 max_step_count = 0;
  for (const auto& step_and_count : step_count) {
    max_step_count = std::max(max_step_count, step_and_count.second);
  }
  // Filter out noisy and incomplete original step ids.
  for (const auto& step_and_count : step_count) {
    if (static_cast<double>(step_and_count.second) / max_step_count >
        kStepFilterRatio) {
      step_map[step_and_count.first] = kUnknownStep;
    }
  }

  // Update the step ids in memory_profile for this allocator.
  int64 step_id = -1;
  for (auto& snapshot : *memory_profile->mutable_memory_profile_snapshots()) {
    DCHECK(snapshot.has_activity_metadata());
    // Convert the random step id to sequential step id.
    int64 orig_step_id = snapshot.activity_metadata().step_id();
    if (step_map.contains(orig_step_id) &&
        step_map[orig_step_id] == kUnknownStep) {
      step_map[orig_step_id] = ++step_id;
    }
    snapshot.mutable_activity_metadata()->set_step_id(step_id);
  }
  VLOG(2) << "Max sequential step id in profile: " << step_id;
}

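// Note on UpdateStepId: the profiler records step ids as random int64 values.
// Ids whose event count exceeds kStepFilterRatio of the maximum count are
// treated as real steps and remapped, in time order, to sequential ids
// 0, 1, 2, ...; events from rarer (noisy or incomplete) ids simply inherit
// the current sequential id.
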
// Update the MemoryActivityMetadata for each deallocation event by copying
// from the matching allocation.
void UpdateDeallocation(PerAllocatorMemoryProfile* memory_profile) {
  absl::flat_hash_map<uint64 /*address*/, const MemoryActivityMetadata*>
      addr_metadata_map;
  for (auto& snapshot : *memory_profile->mutable_memory_profile_snapshots()) {
    // Match the deallocation with previous allocation based on address.
    uint64 address = snapshot.activity_metadata().address();
    if (snapshot.activity_metadata().memory_activity() == DEALLOCATION) {
      if (addr_metadata_map.contains(address)) {
        const MemoryActivityMetadata* alloc_meta = addr_metadata_map[address];
        snapshot.mutable_activity_metadata()->set_tf_op_name(
            alloc_meta->tf_op_name());
        snapshot.mutable_activity_metadata()->set_region_type(
            alloc_meta->region_type());
        snapshot.mutable_activity_metadata()->set_data_type(
            alloc_meta->data_type());
        snapshot.mutable_activity_metadata()->set_tensor_shape(
            alloc_meta->tensor_shape());
        // In case of subsequent (unexpected) deallocations to the same chunk
        // address, leave the metadata as it is (empty or already captured).
        addr_metadata_map.erase(address);
      } else {
        VLOG(2)
            << "Can't find matching memory allocation for this deallocation: "
            << snapshot.DebugString();
      }
    } else if (!addr_metadata_map.contains(address)) {  // Allocation.
      addr_metadata_map[address] = &snapshot.activity_metadata();
    } else {
      VLOG(2) << "There are two allocations recorded for the same address: "
              << address
              << ". The later allocation event is: " << snapshot.DebugString();
    }
  }
  VLOG(2) << "Number of allocations that cannot find matching deallocations: "
          << addr_metadata_map.size();
}

// Return the step id for the peak memory usage data point.
int64 GetPeakMemoryStep(int64 peak_bytes_profile,
                        const PerAllocatorMemoryProfile* memory_profile) {
  int64 peak_bytes_profile_step_id = 0;
  for (const auto& snapshot : memory_profile->memory_profile_snapshots()) {
    // Get the step id of the peak memory usage.
    if (peak_bytes_profile ==
        snapshot.aggregation_stats().heap_allocated_bytes() +
            snapshot.aggregation_stats().stack_reserved_bytes()) {
      DCHECK(snapshot.has_activity_metadata());
      peak_bytes_profile_step_id = snapshot.activity_metadata().step_id();
    }
  }
  return peak_bytes_profile_step_id;
}

// Functor that compares (index, metadata) pair to sort in the order of
// allocation bytes and requested bytes (descending), as well as TF Op name,
// region type, data type, and tensor shape (ascending).
struct MetadataComparator {
  bool operator()(const IndexMetaPair& a, const IndexMetaPair& b) const {
    const MemoryActivityMetadata* a_meta = a.second;
    const MemoryActivityMetadata* b_meta = b.second;
    DCHECK_NE(a_meta, nullptr);
    DCHECK_NE(b_meta, nullptr);

    auto lhs =
        std::make_tuple(-a_meta->allocation_bytes(), -a_meta->requested_bytes(),
                        a_meta->tf_op_name(), a_meta->region_type(),
                        a_meta->data_type(), a_meta->tensor_shape());
    auto rhs =
        std::make_tuple(-b_meta->allocation_bytes(), -b_meta->requested_bytes(),
                        b_meta->tf_op_name(), b_meta->region_type(),
                        b_meta->data_type(), b_meta->tensor_shape());
    return lhs < rhs;
  }
};

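// Note on MetadataComparator: negating the byte counts inside the tuples turns
// std::tuple's lexicographic operator< into "descending by allocation_bytes,
// then by requested_bytes, then ascending by op name / region / type / shape"
// without writing each comparison out by hand.
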
// If applicable, add items into active_allocs vector and special_allocations
// proto for the unmapped memory usage (in heap) and stack reservation at peak.
void InsertSpecialAllocations(int64 unmapped_allocation_bytes, int64 step_id,
                              PerAllocatorMemoryProfile* memory_profile,
                              std::vector<IndexMetaPair>* active_allocs) {
  int index = 0;
  if (unmapped_allocation_bytes > 0) {
    MemoryActivityMetadata* special_allocation =
        memory_profile->add_special_allocations();
    FillActivityMetadata(
        HostEventType::kMemoryAllocation,
        {unmapped_allocation_bytes, unmapped_allocation_bytes, 0,
         "preallocated/unknown", step_id, "persist", 0, "unknown"},
        special_allocation);
    active_allocs->push_back({--index, special_allocation});
  }
  int64 stack_bytes =
      memory_profile->profile_summary().peak_stats().stack_reserved_bytes();
  if (stack_bytes > 0) {
    MemoryActivityMetadata* special_allocation =
        memory_profile->add_special_allocations();
    FillActivityMetadata(
        HostEventType::kMemoryAllocation,
        {stack_bytes, stack_bytes, 0, "stack", step_id, "stack", 0, "unknown"},
        special_allocation);
    active_allocs->push_back({--index, special_allocation});
  }
}

bool operator==(const IndexMetaPair& a, const IndexMetaPair& b) {
  const MemoryActivityMetadata* a_meta = a.second;
  const MemoryActivityMetadata* b_meta = b.second;
  return a_meta->allocation_bytes() == b_meta->allocation_bytes() &&
         a_meta->requested_bytes() == b_meta->requested_bytes() &&
         a_meta->tf_op_name() == b_meta->tf_op_name() &&
         a_meta->region_type() == b_meta->region_type() &&
         a_meta->data_type() == b_meta->data_type() &&
         a_meta->tensor_shape() == b_meta->tensor_shape();
}

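// Note: operator== compares exactly the six fields ordered by
// MetadataComparator, so identical allocations end up adjacent after sorting
// and can be merged by the occurrence-counting loop in
// ProcessActiveAllocations.
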
// Generate the memory breakdown table of active allocations at the peak usage
// (within profiling window) and fill each ActiveAllocation proto (i.e. a row).
void ProcessActiveAllocations(int64 peak_bytes_profile_step_id,
                              PerAllocatorMemoryProfile* memory_profile) {
  int64 unmapped_allocation_bytes =
      memory_profile->profile_summary().peak_stats().heap_allocated_bytes();
  int64 unmapped_deallocation_bytes = 0;
  absl::flat_hash_map<int64 /*address*/, IndexMetaPair> active_alloc_map;
  // Only account for the memory activities in the step that includes peak
  // memory usage.
  for (int i = 0; i < memory_profile->memory_profile_snapshots_size(); i++) {
    const auto& snapshot = memory_profile->memory_profile_snapshots().at(i);
    DCHECK(snapshot.has_activity_metadata());
    const MemoryActivityMetadata& metadata = snapshot.activity_metadata();
    if (snapshot.time_offset_ps() >
        memory_profile->profile_summary().peak_stats_time_ps())
      break;
    if (metadata.step_id() != peak_bytes_profile_step_id) continue;

    if (metadata.memory_activity() == ALLOCATION) {
      active_alloc_map[metadata.address()] = {i, &metadata};
      unmapped_allocation_bytes -= metadata.allocation_bytes();
    } else {
      DCHECK_EQ(metadata.memory_activity(), DEALLOCATION);
      if (active_alloc_map.contains(metadata.address())) {
        active_alloc_map.erase(metadata.address());
      } else {
        unmapped_deallocation_bytes += metadata.allocation_bytes();
      }
      unmapped_allocation_bytes += metadata.allocation_bytes();
    }
  }
  // This separates persistent memory from memory that was allocated in
  // earlier steps but freed within this step.
  unmapped_allocation_bytes -= unmapped_deallocation_bytes;

  VLOG(2) << "unmapped_allocation_bytes=" << unmapped_allocation_bytes
          << ", unmapped_deallocation_bytes=" << unmapped_deallocation_bytes;

  // Using pair of (index, MemoryActivityMetadata*) so that we can sort by the
  // metadata, and fetch metadata by indexing the time-sorted snapshots at
  // frontend.
  std::vector<IndexMetaPair> active_allocs;
  for (const auto& address_and_index_meta : active_alloc_map) {
    active_allocs.push_back(address_and_index_meta.second);
  }

  InsertSpecialAllocations(unmapped_allocation_bytes,
                           peak_bytes_profile_step_id, memory_profile,
                           &active_allocs);

  std::sort(active_allocs.begin(), active_allocs.end(), MetadataComparator());

  // Fill the sorted active_allocations proto messages at peak memory usage.
  // Merge identical allocations and show occurrences.
  for (int i = 0; i < active_allocs.size(); i++) {
    ActiveAllocation* allocation = memory_profile->add_active_allocations();
    allocation->set_snapshot_index(active_allocs[i].first);
    if (active_allocs[i].first < 0) {
      allocation->set_special_index(-active_allocs[i].first - 1);
    } else {
      allocation->set_special_index(-1);
    }
    allocation->set_num_occurrences(1);
    while (i < active_allocs.size() - 1 &&
           active_allocs[i] == active_allocs[i + 1]) {
      allocation->set_num_occurrences(allocation->num_occurrences() + 1);
      i++;
    }
  }

  VLOG(2) << "Distinctive active allocation count="
          << memory_profile->active_allocations_size();
}

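// Note on indices: rows created by InsertSpecialAllocations carry negative
// snapshot_index values (-1, -2, ...); ProcessActiveAllocations exposes them
// to the frontend as special_index 0, 1, ... via special_index = -index - 1,
// while rows backed by real snapshots keep special_index == -1.
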
// Post-process the memory profile to correctly update proto fields, and break
// down peak memory usage for each allocator.
void ProcessMemoryProfileProto(MemoryProfile* memory_profile) {
  memory_profile->set_num_hosts(1);
  // Add sorted memory ids within memory profile data to the selection list.
  for (const auto& id_and_allocator_profile :
       memory_profile->memory_profile_per_allocator()) {
    if (!id_and_allocator_profile.second.memory_profile_snapshots().empty()) {
      memory_profile->add_memory_ids(id_and_allocator_profile.first);
    }
  }
  absl::c_sort(*memory_profile->mutable_memory_ids());

  for (auto& id_and_allocator_profile :
       *memory_profile->mutable_memory_profile_per_allocator()) {
    PerAllocatorMemoryProfile* allocator_memory_profile =
        &id_and_allocator_profile.second;
    // Sort the memory_profile_snapshots by time_offset_ps (ascending) in
    // proto.
    absl::c_sort(
        *allocator_memory_profile->mutable_memory_profile_snapshots(),
        [](const MemoryProfileSnapshot& a, const MemoryProfileSnapshot& b) {
          return a.time_offset_ps() < b.time_offset_ps();
        });

    UpdateStepId(memory_profile->step_count(), allocator_memory_profile);
    UpdateDeallocation(allocator_memory_profile);

    int64 peak_bytes_profile = allocator_memory_profile->profile_summary()
                                   .peak_stats()
                                   .peak_bytes_in_use();
    int64 peak_step_id =
        GetPeakMemoryStep(peak_bytes_profile, allocator_memory_profile);
    ProcessActiveAllocations(peak_step_id, allocator_memory_profile);
  }
}

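// Processing order matters here: snapshots are time-sorted first, so that
// UpdateStepId assigns sequential ids in chronological order and
// UpdateDeallocation can match each deallocation against the most recent
// allocation at the same address.
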
}  // namespace

MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane) {
  MemoryProfile memory_profile = GenerateMemoryProfile(&host_plane);
  ProcessMemoryProfileProto(&memory_profile);
  return memory_profile;
}

}  // namespace profiler
}  // namespace tensorflow
tensorflow/core/profiler/convert/xplane_to_memory_profile.h (new file, 31 lines)
@@ -0,0 +1,31 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_
#define TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_

#include "tensorflow/core/profiler/protobuf/memory_profile.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"

namespace tensorflow {
namespace profiler {

// Process the host threads XPlane and generate MemoryProfile result.
MemoryProfile ConvertXPlaneToMemoryProfile(const XPlane& host_plane);

}  // namespace profiler
}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_PROFILER_CONVERT_XPLANE_TO_MEMORY_PROFILE_H_
tensorflow/core/profiler/convert/xplane_to_memory_profile_test.cc (new file, 117 lines)
@@ -0,0 +1,117 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h"

#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/profiler/utils/xplane_builder.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {
namespace {

// Tests with a sample profile with multiple memory allocation and deallocation
// activities within one memory allocator captured in host trace.
TEST(ConvertXPlaneToMemoryProfile, OneAllocatorMultiActivitiesTest) {
  XSpace space;
  XPlane* host_plane = space.add_planes();
  XPlaneBuilder host_plane_builder(host_plane);
  host_plane_builder.SetName(kHostThreads);
  host_plane_builder.ReserveLines(1);

  auto tf_executor_thread = host_plane_builder.GetOrCreateLine(0);
  CreateXEventWithIntAndStringViewMetadataValue(
      &host_plane_builder, &tf_executor_thread, "MemoryAllocation", 40000, 1000,
      {{StatType::kBytesReserved, 2000},
       {StatType::kBytesAllocated, 3000},
       {StatType::kBytesAvailable, 5000},
       {StatType::kPeakBytesInUse, 8500},
       {StatType::kRequestedBytes, 200},
       {StatType::kAllocationBytes, 256},
       {StatType::kAddress, 222333},
       {StatType::kStepId, -93746},
       {StatType::kDataType, 1}},
      {{StatType::kAllocatorName, "GPU_0_bfc"},
       {StatType::kTfOp, "foo/bar"},
       {StatType::kRegionType, "output"},
       {StatType::kTensorShapes, "[3, 3, 512, 512]"}});

  CreateXEventWithIntAndStringViewMetadataValue(
      &host_plane_builder, &tf_executor_thread, "MemoryDeallocation", 50000,
      1000,
      {{StatType::kBytesReserved, 2000},
       {StatType::kBytesAllocated, 2744},
       {StatType::kBytesAvailable, 5256},
       {StatType::kPeakBytesInUse, 8500},
       {StatType::kRequestedBytes, 200},
       {StatType::kAllocationBytes, 256},
       {StatType::kAddress, 222333},
       {StatType::kStepId, 0},
       {StatType::kDataType, 0}},
      {{StatType::kAllocatorName, "GPU_0_bfc"},
       {StatType::kRegionType, ""},
       {StatType::kTensorShapes, ""}});

  CreateXEventWithIntAndStringViewMetadataValue(
      &host_plane_builder, &tf_executor_thread, "MemoryAllocation", 70000, 1000,
      {{StatType::kBytesReserved, 2000},
       {StatType::kBytesAllocated, 5000},
       {StatType::kBytesAvailable, 3000},
       {StatType::kPeakBytesInUse, 9500},
       {StatType::kRequestedBytes, 300},
       {StatType::kAllocationBytes, 300},
       {StatType::kAddress, 345678},
       {StatType::kStepId, -93746},
       {StatType::kDataType, 9}},
      {{StatType::kAllocatorName, "GPU_0_bfc"},
       {StatType::kTfOp, "mul_grad/Sum"},
       {StatType::kRegionType, "temp"},
       {StatType::kTensorShapes, "[1, 2]"}});

  MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane);
  EXPECT_EQ(memory_profile.memory_profile_per_allocator().size(), 1);
  EXPECT_EQ(memory_profile.num_hosts(), 1);
  EXPECT_EQ(memory_profile.memory_ids_size(), 1);
  EXPECT_EQ(memory_profile.step_count().size(), 1);
  EXPECT_EQ(memory_profile.memory_profile_per_allocator().begin()->first,
            "GPU_0_bfc");
  const auto& allocator_memory_profile =
      memory_profile.memory_profile_per_allocator().begin()->second;
  EXPECT_EQ(
      allocator_memory_profile.profile_summary().peak_bytes_usage_lifetime(),
      9500);
  EXPECT_EQ(allocator_memory_profile.profile_summary()
                .peak_stats()
                .peak_bytes_in_use(),
            7000);
  EXPECT_EQ(allocator_memory_profile.profile_summary().peak_stats_time_ps(),
            70000);
  EXPECT_EQ(allocator_memory_profile.memory_profile_snapshots_size(), 3);
  EXPECT_EQ(allocator_memory_profile.active_allocations_size(), 3);
  EXPECT_EQ(
      allocator_memory_profile.active_allocations().at(2).snapshot_index(), 2);
  EXPECT_EQ(allocator_memory_profile.special_allocations_size(), 2);
  EXPECT_EQ(allocator_memory_profile.special_allocations().at(1).tf_op_name(),
            "stack");
  EXPECT_EQ(
      allocator_memory_profile.special_allocations().at(1).allocation_bytes(),
      2000);
}

}  // namespace
}  // namespace profiler
}  // namespace tensorflow
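
The expected values follow directly from the three synthetic events:

// Snapshot totals (stack_reserved + heap_allocated):
//   t=40000 ps: 2000 + 3000 = 5000
//   t=50000 ps: 2000 + 2744 = 4744   (256-byte deallocation)
//   t=70000 ps: 2000 + 5000 = 7000   <- peak within the profiling window
// Hence peak_stats().peak_bytes_in_use() == 7000 at peak_stats_time_ps 70000,
// while peak_bytes_usage_lifetime() == 9500 comes straight from the largest
// kPeakBytesInUse stat reported by the allocator.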
tensorflow/core/profiler/convert/xplane_to_profile_response.cc
@@ -21,17 +21,21 @@ limitations under the License.
#include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"
#include "tensorflow/core/profiler/convert/op_stats_to_tf_stats.h"
#include "tensorflow/core/profiler/convert/trace_events_to_json.h"
#include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h"
#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h"
#include "tensorflow/core/profiler/convert/xplane_to_trace_events.h"
#include "tensorflow/core/profiler/profiler_service.pb.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/memory_profile.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
#include "tensorflow/core/profiler/protobuf/tf_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/xplane.pb.h"
#include "tensorflow/core/profiler/rpc/client/save_profile.h"
#include "tensorflow/core/profiler/utils/xplane_schema.h"
#include "tensorflow/core/profiler/utils/xplane_utils.h"

namespace tensorflow {
namespace profiler {
@@ -42,6 +46,7 @@ const absl::string_view kTensorflowStats = "tensorflow_stats";
const absl::string_view kInputPipeline = "input_pipeline";
const absl::string_view kOverviewPage = "overview_page";
const absl::string_view kKernelStats = "kernel_stats";
const absl::string_view kMemoryProfile = "memory_profile";

HardwareType HardwareTypeFromRunEnvironment(const RunEnvironment& run_env) {
  if (run_env.device_type() == "GPU") return HardwareType::GPU;
@@ -107,6 +112,12 @@ Status ConvertXSpaceToProfileResponse(const XSpace& xspace,
  if (tools.contains(kKernelStats)) {
    AddToolData(ToolName(kKernelStats), op_stats.kernel_stats_db(), response);
  }
  if (tools.contains(kMemoryProfile)) {
    if (const XPlane* host_plane = FindPlaneWithName(xspace, kHostThreads)) {
      MemoryProfile memory_profile = ConvertXPlaneToMemoryProfile(*host_plane);
      AddToolData(ToolName(kMemoryProfile), memory_profile, response);
    }
  }
  return Status::OK();
}

tensorflow/core/profiler/protobuf/memory_profile.proto
@@ -122,4 +122,7 @@ message MemoryProfile {
  // Ids for profiled memory allocators, used to populate memory selection list
  // at front end.
  repeated string memory_ids = 3;
  // Map of original random int64 step id to the count of memory activity
  // events assigned with it.
  map<int64 /*orig_step_id*/, int64 /*count*/> step_count = 4;
}

tensorflow/core/profiler/utils/xplane_utils.cc
@@ -171,6 +171,29 @@ XEventBuilder CreateXEventWithStringViewMetadataValue(
  return event_builder;
}

XEventBuilder CreateXEventWithIntAndStringViewMetadataValue(
    XPlaneBuilder* plane_builder, XLineBuilder* line_builder,
    absl::string_view event_name, int64 offset_ps, int64 duration_ps,
    const absl::flat_hash_map<StatType, int64 /*stat_value*/>& int_stats,
    const absl::flat_hash_map<StatType, absl::string_view /*stat_value*/>&
        str_stats) {
  auto event_builder = line_builder->AddEvent(
      *plane_builder->GetOrCreateEventMetadata(event_name));
  event_builder.SetOffsetPs(offset_ps);
  event_builder.SetDurationPs(duration_ps);
  for (const auto& stat_type_and_value : int_stats) {
    event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata(
                                   GetStatTypeStr(stat_type_and_value.first)),
                               stat_type_and_value.second);
  }
  for (const auto& stat_type_and_value : str_stats) {
    event_builder.AddStatValue(*plane_builder->GetOrCreateStatMetadata(
                                   GetStatTypeStr(stat_type_and_value.first)),
                               stat_type_and_value.second);
  }
  return event_builder;
}

void RemovePlaneWithName(XSpace* space, absl::string_view name) {
  auto* planes = space->mutable_planes();
  planes->erase(
tensorflow/core/profiler/utils/xplane_utils.h
@@ -63,6 +63,14 @@ XEventBuilder CreateXEventWithStringViewMetadataValue(
    const absl::flat_hash_map<StatType, absl::string_view /*stat_value*/>&
        stats);

// Creates an XEvent with int64 and string stats.
XEventBuilder CreateXEventWithIntAndStringViewMetadataValue(
    XPlaneBuilder* plane_builder, XLineBuilder* line_builder,
    absl::string_view event_name, int64 offset_ps, int64 duration_ps,
    const absl::flat_hash_map<StatType, int64 /*stat_value*/>& int_stats,
    const absl::flat_hash_map<StatType, absl::string_view /*stat_value*/>&
        str_stats);

void RemovePlaneWithName(XSpace* space, absl::string_view name);
void RemoveEmptyPlanes(XSpace* space);
void RemoveEmptyLines(XPlane* plane);