Break dependency of profiler_backends on core:core_cpu_lib
PiperOrigin-RevId: 298190813 Change-Id: Ic26fbdf6f5f3971503689c295fa58093901afb27
This commit is contained in:
parent
8a8683e000
commit
626d4a0479
tensorflow/core
@ -16,7 +16,8 @@ limitations under the License.
|
||||
#include <vector>
|
||||
|
||||
#include "absl/strings/str_split.h"
|
||||
#include "tensorflow/core/common_runtime/step_stats_collector.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/lib/core/status.h"
|
||||
#include "tensorflow/core/platform/env_time.h"
|
||||
#include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h"
|
||||
@ -77,11 +78,11 @@ HostTracer::~HostTracer() { Stop().IgnoreError(); }
|
||||
|
||||
Status HostTracer::Start() {
|
||||
if (recording_) {
|
||||
return Status(error::INTERNAL, "TraceMeRecorder already started");
|
||||
return errors::Internal("TraceMeRecorder already started");
|
||||
}
|
||||
recording_ = TraceMeRecorder::Start(host_trace_level_);
|
||||
if (!recording_) {
|
||||
return Status(error::INTERNAL, "Failed to start TraceMeRecorder");
|
||||
return errors::Internal("Failed to start TraceMeRecorder");
|
||||
}
|
||||
start_timestamp_ns_ = EnvTime::NowNanos();
|
||||
return Status::OK();
|
||||
@ -89,7 +90,7 @@ Status HostTracer::Start() {
|
||||
|
||||
Status HostTracer::Stop() {
|
||||
if (!recording_) {
|
||||
return Status(error::INTERNAL, "TraceMeRecorder not started");
|
||||
return errors::Internal("TraceMeRecorder not started");
|
||||
}
|
||||
events_ = TraceMeRecorder::Stop();
|
||||
recording_ = false;
|
||||
@ -101,16 +102,18 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) {
|
||||
return errors::Internal("TraceMeRecorder not stopped");
|
||||
}
|
||||
MakeCompleteEvents(&events_);
|
||||
StepStatsCollector step_stats_collector(run_metadata->mutable_step_stats());
|
||||
|
||||
StepStats* step_stats = run_metadata->mutable_step_stats();
|
||||
DeviceStepStats* dev_stats = step_stats->add_dev_stats();
|
||||
dev_stats->set_device("/host:CPU");
|
||||
auto* thread_names = dev_stats->mutable_thread_names();
|
||||
|
||||
constexpr char kUserMetadataMarker = '#';
|
||||
const string cpu_name = "/host:CPU";
|
||||
for (auto& thread : events_) {
|
||||
step_stats_collector.SaveThreadName(cpu_name, thread.thread.tid,
|
||||
thread.thread.name);
|
||||
for (auto& event : thread.events) {
|
||||
for (TraceMeRecorder::ThreadEvents& thread : events_) {
|
||||
thread_names->insert({thread.thread.tid, thread.thread.name});
|
||||
for (TraceMeRecorder::Event& event : thread.events) {
|
||||
if (event.start_time && event.end_time) {
|
||||
NodeExecStats* ns = new NodeExecStats;
|
||||
NodeExecStats* ns = dev_stats->add_node_stats();
|
||||
if (event.name.back() != kUserMetadataMarker) {
|
||||
ns->set_node_name(std::move(event.name));
|
||||
} else {
|
||||
@ -128,12 +131,10 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) {
|
||||
ns->set_all_end_rel_micros((event.end_time - event.start_time) /
|
||||
EnvTime::kMicrosToNanos);
|
||||
ns->set_thread_id(thread.thread.tid);
|
||||
step_stats_collector.Save(cpu_name, ns);
|
||||
}
|
||||
}
|
||||
}
|
||||
events_.clear();
|
||||
step_stats_collector.Finalize();
|
||||
return Status::OK();
|
||||
}
|
||||
|
||||
|
@ -17,7 +17,6 @@ limitations under the License.
|
||||
#include <gmock/gmock.h>
|
||||
#include <gtest/gtest.h>
|
||||
#include "absl/types/optional.h"
|
||||
#include "tensorflow/core/common_runtime/step_stats_collector.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/lib/core/status_test_util.h"
|
||||
#include "tensorflow/core/platform/env.h"
|
||||
|
@ -31,7 +31,6 @@ tf_cuda_library(
|
||||
],
|
||||
deps = [
|
||||
":cupti_utils",
|
||||
"//tensorflow/core:core_cpu_internal",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:protos_all_cc",
|
||||
"//tensorflow/core/profiler/internal:annotation_stack",
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
#include <stdlib.h>
|
||||
|
||||
#include <memory>
|
||||
#include <utility>
|
||||
|
||||
#include "absl/container/fixed_array.h"
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
@ -25,10 +26,13 @@ limitations under the License.
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include "tensorflow/core/common_runtime/step_stats_collector.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/lib/core/errors.h"
|
||||
#include "tensorflow/core/platform/abi.h"
|
||||
#include "tensorflow/core/platform/env_time.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
#include "tensorflow/core/profiler/internal/annotation_stack.h"
|
||||
#include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
|
||||
#include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h"
|
||||
@ -199,13 +203,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
}
|
||||
void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
|
||||
void Flush() override {}
|
||||
void Export(StepStatsCollector* trace_collector) {
|
||||
void Export(StepStats* step_stats) {
|
||||
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
|
||||
<< " callback api events and " << num_activity_events_
|
||||
<< " activity events.";
|
||||
for (int i = 0; i < num_gpus_; ++i) {
|
||||
per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
|
||||
trace_collector);
|
||||
step_stats);
|
||||
}
|
||||
}
|
||||
void Export(XSpace* space) {
|
||||
@ -254,7 +258,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
};
|
||||
struct PerDeviceCollector {
|
||||
void AddEvent(CuptiTracerEvent&& event) {
|
||||
absl::MutexLock lock(&mutex);
|
||||
mutex_lock l(m);
|
||||
if (event.source == CuptiTracerEventSource::DriverCallback) {
|
||||
// Cupti api callback events were used to populate launch times etc.
|
||||
if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
|
||||
@ -270,12 +274,16 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
}
|
||||
|
||||
void Flush(int32 device_ordinal, uint64 start_walltime_ns,
|
||||
uint64 start_gpu_ns, StepStatsCollector* collector) {
|
||||
absl::MutexLock lock(&mutex);
|
||||
stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:");
|
||||
memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy");
|
||||
sync_device = absl::StrCat("/device:GPU:", device_ordinal, "/sync");
|
||||
for (auto& event : events) {
|
||||
uint64 start_gpu_ns, StepStats* step_stats) {
|
||||
mutex_lock l(m);
|
||||
absl::flat_hash_map<std::pair<int64 /*stream_id*/, CuptiTracerEventType>,
|
||||
DeviceStepStats*>
|
||||
stream_dev_stats_map;
|
||||
DeviceStepStats* unknown_stream_dev_stats = nullptr;
|
||||
DeviceStepStats* all_streams_dev_stats = nullptr;
|
||||
DeviceStepStats* memcpy_dev_stats = nullptr;
|
||||
DeviceStepStats* sync_dev_stats = nullptr;
|
||||
for (const CuptiTracerEvent& event : events) {
|
||||
NodeExecStats* ns = new NodeExecStats;
|
||||
ns->set_all_start_micros(
|
||||
(start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
|
||||
@ -291,7 +299,12 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
ns->set_node_name(event.name);
|
||||
ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
|
||||
ns->set_thread_id(event.thread_id);
|
||||
collector->Save(sync_device, ns);
|
||||
if (sync_dev_stats == nullptr) {
|
||||
sync_dev_stats = step_stats->add_dev_stats();
|
||||
sync_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
|
||||
}
|
||||
sync_dev_stats->add_node_stats()->Swap(ns);
|
||||
}
|
||||
} else { // CuptiTracerEventSource::Activity
|
||||
// Get launch information if available.
|
||||
@ -312,19 +325,30 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
ns->set_node_name(activity_name);
|
||||
switch (event.type) {
|
||||
case CuptiTracerEventType::Kernel: {
|
||||
const std::string details = absl::StrFormat(
|
||||
"regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
|
||||
event.kernel_info.registers_per_thread,
|
||||
ns->set_timeline_label(absl::StrFormat(
|
||||
"%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s",
|
||||
kernel_name, event.kernel_info.registers_per_thread,
|
||||
event.kernel_info.static_shared_memory_usage,
|
||||
event.kernel_info.grid_x, event.kernel_info.grid_y,
|
||||
event.kernel_info.grid_z, event.kernel_info.block_x,
|
||||
event.kernel_info.block_y, event.kernel_info.block_z);
|
||||
ns->set_timeline_label(absl::StrCat(kernel_name, " ", details,
|
||||
"@@", event.annotation));
|
||||
auto nscopy = new NodeExecStats(*ns);
|
||||
collector->Save(absl::StrCat(stream_device, "all"), ns);
|
||||
collector->Save(absl::StrCat(stream_device, event.stream_id),
|
||||
nscopy);
|
||||
event.kernel_info.block_y, event.kernel_info.block_z,
|
||||
event.annotation));
|
||||
DeviceStepStats*& stream_dev_stats =
|
||||
stream_dev_stats_map[std::make_pair(event.stream_id,
|
||||
event.type)];
|
||||
if (stream_dev_stats == nullptr) {
|
||||
stream_dev_stats = step_stats->add_dev_stats();
|
||||
stream_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal,
|
||||
"/stream:", event.stream_id));
|
||||
}
|
||||
*stream_dev_stats->add_node_stats() = *ns;
|
||||
if (all_streams_dev_stats == nullptr) {
|
||||
all_streams_dev_stats = step_stats->add_dev_stats();
|
||||
all_streams_dev_stats->set_device(absl::StrCat(
|
||||
"/device:GPU:", device_ordinal, "/stream:all"));
|
||||
}
|
||||
all_streams_dev_stats->add_node_stats()->Swap(ns);
|
||||
break;
|
||||
}
|
||||
case CuptiTracerEventType::MemcpyH2D:
|
||||
@ -341,17 +365,33 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
" to device:", event.memcpy_info.destination);
|
||||
}
|
||||
ns->set_timeline_label(std::move(details));
|
||||
auto nscopy = new NodeExecStats(*ns);
|
||||
collector->Save(memcpy_device, ns);
|
||||
collector->Save(
|
||||
absl::StrCat(stream_device, event.stream_id, "<",
|
||||
GetTraceEventTypeName(event.type), ">"),
|
||||
nscopy);
|
||||
DeviceStepStats*& stream_dev_stats =
|
||||
stream_dev_stats_map[std::make_pair(event.stream_id,
|
||||
event.type)];
|
||||
if (stream_dev_stats == nullptr) {
|
||||
stream_dev_stats = step_stats->add_dev_stats();
|
||||
stream_dev_stats->set_device(absl::StrCat(
|
||||
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
|
||||
"<", GetTraceEventTypeName(event.type), ">"));
|
||||
}
|
||||
*stream_dev_stats->add_node_stats() = *ns;
|
||||
if (memcpy_dev_stats == nullptr) {
|
||||
memcpy_dev_stats = step_stats->add_dev_stats();
|
||||
memcpy_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
|
||||
}
|
||||
memcpy_dev_stats->add_node_stats()->Swap(ns);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
ns->set_timeline_label(activity_name);
|
||||
collector->Save(stream_device, ns);
|
||||
if (unknown_stream_dev_stats == nullptr) {
|
||||
unknown_stream_dev_stats = step_stats->add_dev_stats();
|
||||
unknown_stream_dev_stats->set_device(
|
||||
absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
|
||||
}
|
||||
unknown_stream_dev_stats->add_node_stats()->Swap(ns);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -360,8 +400,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
|
||||
void Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
|
||||
XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
|
||||
absl::MutexLock lock(&mutex);
|
||||
|
||||
mutex_lock l(m);
|
||||
// Tracking event types per line.
|
||||
absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
|
||||
events_types_per_line;
|
||||
@ -449,13 +488,9 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
}
|
||||
}
|
||||
|
||||
absl::Mutex mutex;
|
||||
std::string stream_device GUARDED_BY(mutex);
|
||||
std::string memcpy_device GUARDED_BY(mutex);
|
||||
std::string sync_device GUARDED_BY(mutex);
|
||||
std::vector<CuptiTracerEvent> events GUARDED_BY(mutex);
|
||||
absl::flat_hash_map<uint32, CorrelationInfo> correlation_info
|
||||
GUARDED_BY(mutex);
|
||||
mutex m;
|
||||
std::vector<CuptiTracerEvent> events GUARDED_BY(m);
|
||||
absl::flat_hash_map<uint32, CorrelationInfo> correlation_info GUARDED_BY(m);
|
||||
};
|
||||
absl::FixedArray<PerDeviceCollector> per_device_collector_;
|
||||
|
||||
@ -495,7 +530,6 @@ class GpuTracer : public profiler::ProfilerInterface {
|
||||
|
||||
CuptiTracer* cupti_tracer_;
|
||||
CuptiTracerOptions options_;
|
||||
StepStats step_stats_;
|
||||
std::unique_ptr<CuptiTraceCollectorImpl> cupti_collector_;
|
||||
};
|
||||
|
||||
@ -605,12 +639,11 @@ Status GpuTracer::CollectData(RunMetadata* run_metadata) {
|
||||
return Status::OK();
|
||||
case State::kStoppedOk: {
|
||||
// Input run_metadata is shared by profiler interfaces, so we need to append.
|
||||
StepStatsCollector step_stats_collector(&step_stats_);
|
||||
StepStats step_stats;
|
||||
if (cupti_collector_) {
|
||||
cupti_collector_->Export(&step_stats_collector);
|
||||
cupti_collector_->Export(&step_stats);
|
||||
}
|
||||
step_stats_collector.Finalize();
|
||||
for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
|
||||
for (auto& dev_stats : *step_stats.mutable_dev_stats()) {
|
||||
run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
|
||||
}
|
||||
return Status::OK();
|
||||
|
@ -20,7 +20,6 @@ limitations under the License.
|
||||
#include <vector>
|
||||
|
||||
#include "tensorflow/core/common_runtime/direct_session.h"
|
||||
#include "tensorflow/core/common_runtime/step_stats_collector.h"
|
||||
#include "tensorflow/core/framework/allocator.h"
|
||||
#include "tensorflow/core/framework/graph.pb.h"
|
||||
#include "tensorflow/core/framework/tensor.h"
|
||||
|
@ -58,7 +58,7 @@ class ProfilerInterface {
|
||||
// Stops profiling.
|
||||
virtual Status Stop() = 0;
|
||||
|
||||
// Saves collected profile data into step_stats_collector.
|
||||
// Saves collected profile data into run_metadata.
|
||||
// After this or the overload below is called once, subsequent calls might
|
||||
// return empty data.
|
||||
virtual Status CollectData(RunMetadata* run_metadata) = 0;
|
||||
|
@ -132,7 +132,10 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
|
||||
int64 mem_total = 0;
|
||||
|
||||
int64 first_node_start_us =
|
||||
step_stats.dev_stats(0).node_stats(0).all_start_micros();
|
||||
(step_stats.dev_stats_size() > 0 &&
|
||||
step_stats.dev_stats(0).node_stats_size() > 0)
|
||||
? step_stats.dev_stats(0).node_stats(0).all_start_micros()
|
||||
: 0;
|
||||
|
||||
int node_num = 0;
|
||||
for (const auto& ds : step_stats.dev_stats()) {
|
||||
|
Loading…
Reference in New Issue
Block a user