Break dependency of profiler_backends on core:core_cpu_lib

PiperOrigin-RevId: 298190813
Change-Id: Ic26fbdf6f5f3971503689c295fa58093901afb27
This commit is contained in:
Jose Baiocchi 2020-03-01 07:55:24 -08:00 committed by TensorFlower Gardener
parent 8a8683e000
commit 626d4a0479
7 changed files with 94 additions and 60 deletions

View File

@ -16,7 +16,8 @@ limitations under the License.
#include <vector>
#include "absl/strings/str_split.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/env_time.h"
#include "tensorflow/core/profiler/internal/cpu/host_tracer_utils.h"
@ -77,11 +78,11 @@ HostTracer::~HostTracer() { Stop().IgnoreError(); }
Status HostTracer::Start() {
if (recording_) {
return Status(error::INTERNAL, "TraceMeRecorder already started");
return errors::Internal("TraceMeRecorder already started");
}
recording_ = TraceMeRecorder::Start(host_trace_level_);
if (!recording_) {
return Status(error::INTERNAL, "Failed to start TraceMeRecorder");
return errors::Internal("Failed to start TraceMeRecorder");
}
start_timestamp_ns_ = EnvTime::NowNanos();
return Status::OK();
@ -89,7 +90,7 @@ Status HostTracer::Start() {
Status HostTracer::Stop() {
if (!recording_) {
return Status(error::INTERNAL, "TraceMeRecorder not started");
return errors::Internal("TraceMeRecorder not started");
}
events_ = TraceMeRecorder::Stop();
recording_ = false;
@ -101,16 +102,18 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) {
return errors::Internal("TraceMeRecorder not stopped");
}
MakeCompleteEvents(&events_);
StepStatsCollector step_stats_collector(run_metadata->mutable_step_stats());
StepStats* step_stats = run_metadata->mutable_step_stats();
DeviceStepStats* dev_stats = step_stats->add_dev_stats();
dev_stats->set_device("/host:CPU");
auto* thread_names = dev_stats->mutable_thread_names();
constexpr char kUserMetadataMarker = '#';
const string cpu_name = "/host:CPU";
for (auto& thread : events_) {
step_stats_collector.SaveThreadName(cpu_name, thread.thread.tid,
thread.thread.name);
for (auto& event : thread.events) {
for (TraceMeRecorder::ThreadEvents& thread : events_) {
thread_names->insert({thread.thread.tid, thread.thread.name});
for (TraceMeRecorder::Event& event : thread.events) {
if (event.start_time && event.end_time) {
NodeExecStats* ns = new NodeExecStats;
NodeExecStats* ns = dev_stats->add_node_stats();
if (event.name.back() != kUserMetadataMarker) {
ns->set_node_name(std::move(event.name));
} else {
@ -128,12 +131,10 @@ Status HostTracer::CollectData(RunMetadata* run_metadata) {
ns->set_all_end_rel_micros((event.end_time - event.start_time) /
EnvTime::kMicrosToNanos);
ns->set_thread_id(thread.thread.tid);
step_stats_collector.Save(cpu_name, ns);
}
}
}
events_.clear();
step_stats_collector.Finalize();
return Status::OK();
}

View File

@ -17,7 +17,6 @@ limitations under the License.
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include "absl/types/optional.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/lib/core/status_test_util.h"
#include "tensorflow/core/platform/env.h"

View File

@ -31,7 +31,6 @@ tf_cuda_library(
],
deps = [
":cupti_utils",
"//tensorflow/core:core_cpu_internal",
"//tensorflow/core:lib",
"//tensorflow/core:protos_all_cc",
"//tensorflow/core/profiler/internal:annotation_stack",

View File

@ -18,6 +18,7 @@ limitations under the License.
#include <stdlib.h>
#include <memory>
#include <utility>
#include "absl/container/fixed_array.h"
#include "absl/container/flat_hash_map.h"
@ -25,10 +26,13 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/env_time.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/profiler/internal/annotation_stack.h"
#include "tensorflow/core/profiler/internal/gpu/cupti_tracer.h"
#include "tensorflow/core/profiler/internal/gpu/cupti_wrapper.h"
@ -199,13 +203,13 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}
void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
void Flush() override {}
void Export(StepStatsCollector* trace_collector) {
void Export(StepStats* step_stats) {
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
<< " callback api events and " << num_activity_events_
<< " activity events.";
for (int i = 0; i < num_gpus_; ++i) {
per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
trace_collector);
step_stats);
}
}
void Export(XSpace* space) {
@ -254,7 +258,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
};
struct PerDeviceCollector {
void AddEvent(CuptiTracerEvent&& event) {
absl::MutexLock lock(&mutex);
mutex_lock l(m);
if (event.source == CuptiTracerEventSource::DriverCallback) {
// Cupti api callback events were used to populate launch times etc.
if (event.correlation_id != CuptiTracerEvent::kInvalidCorrelationId) {
@ -270,12 +274,16 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}
void Flush(int32 device_ordinal, uint64 start_walltime_ns,
uint64 start_gpu_ns, StepStatsCollector* collector) {
absl::MutexLock lock(&mutex);
stream_device = absl::StrCat("/device:GPU:", device_ordinal, "/stream:");
memcpy_device = absl::StrCat("/device:GPU:", device_ordinal, "/memcpy");
sync_device = absl::StrCat("/device:GPU:", device_ordinal, "/sync");
for (auto& event : events) {
uint64 start_gpu_ns, StepStats* step_stats) {
mutex_lock l(m);
absl::flat_hash_map<std::pair<int64 /*stream_id*/, CuptiTracerEventType>,
DeviceStepStats*>
stream_dev_stats_map;
DeviceStepStats* unknown_stream_dev_stats = nullptr;
DeviceStepStats* all_streams_dev_stats = nullptr;
DeviceStepStats* memcpy_dev_stats = nullptr;
DeviceStepStats* sync_dev_stats = nullptr;
for (const CuptiTracerEvent& event : events) {
NodeExecStats* ns = new NodeExecStats;
ns->set_all_start_micros(
(start_walltime_ns + (event.start_time_ns - start_gpu_ns)) / 1000);
@ -291,7 +299,12 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
ns->set_node_name(event.name);
ns->set_timeline_label(absl::StrCat("ThreadId ", event.thread_id));
ns->set_thread_id(event.thread_id);
collector->Save(sync_device, ns);
if (sync_dev_stats == nullptr) {
sync_dev_stats = step_stats->add_dev_stats();
sync_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/sync"));
}
sync_dev_stats->add_node_stats()->Swap(ns);
}
} else { // CuptiTracerEventSource::Activity
// Get launch information if available.
@ -312,19 +325,30 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
ns->set_node_name(activity_name);
switch (event.type) {
case CuptiTracerEventType::Kernel: {
const std::string details = absl::StrFormat(
"regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u",
event.kernel_info.registers_per_thread,
ns->set_timeline_label(absl::StrFormat(
"%s regs:%u shm:%u grid:%u,%u,%u block:%u,%u,%u@@%s",
kernel_name, event.kernel_info.registers_per_thread,
event.kernel_info.static_shared_memory_usage,
event.kernel_info.grid_x, event.kernel_info.grid_y,
event.kernel_info.grid_z, event.kernel_info.block_x,
event.kernel_info.block_y, event.kernel_info.block_z);
ns->set_timeline_label(absl::StrCat(kernel_name, " ", details,
"@@", event.annotation));
auto nscopy = new NodeExecStats(*ns);
collector->Save(absl::StrCat(stream_device, "all"), ns);
collector->Save(absl::StrCat(stream_device, event.stream_id),
nscopy);
event.kernel_info.block_y, event.kernel_info.block_z,
event.annotation));
DeviceStepStats*& stream_dev_stats =
stream_dev_stats_map[std::make_pair(event.stream_id,
event.type)];
if (stream_dev_stats == nullptr) {
stream_dev_stats = step_stats->add_dev_stats();
stream_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal,
"/stream:", event.stream_id));
}
*stream_dev_stats->add_node_stats() = *ns;
if (all_streams_dev_stats == nullptr) {
all_streams_dev_stats = step_stats->add_dev_stats();
all_streams_dev_stats->set_device(absl::StrCat(
"/device:GPU:", device_ordinal, "/stream:all"));
}
all_streams_dev_stats->add_node_stats()->Swap(ns);
break;
}
case CuptiTracerEventType::MemcpyH2D:
@ -341,17 +365,33 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
" to device:", event.memcpy_info.destination);
}
ns->set_timeline_label(std::move(details));
auto nscopy = new NodeExecStats(*ns);
collector->Save(memcpy_device, ns);
collector->Save(
absl::StrCat(stream_device, event.stream_id, "<",
GetTraceEventTypeName(event.type), ">"),
nscopy);
DeviceStepStats*& stream_dev_stats =
stream_dev_stats_map[std::make_pair(event.stream_id,
event.type)];
if (stream_dev_stats == nullptr) {
stream_dev_stats = step_stats->add_dev_stats();
stream_dev_stats->set_device(absl::StrCat(
"/device:GPU:", device_ordinal, "/stream:", event.stream_id,
"<", GetTraceEventTypeName(event.type), ">"));
}
*stream_dev_stats->add_node_stats() = *ns;
if (memcpy_dev_stats == nullptr) {
memcpy_dev_stats = step_stats->add_dev_stats();
memcpy_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/memcpy"));
}
memcpy_dev_stats->add_node_stats()->Swap(ns);
break;
}
default:
ns->set_timeline_label(activity_name);
collector->Save(stream_device, ns);
if (unknown_stream_dev_stats == nullptr) {
unknown_stream_dev_stats = step_stats->add_dev_stats();
unknown_stream_dev_stats->set_device(
absl::StrCat("/device:GPU:", device_ordinal, "/stream:"));
}
unknown_stream_dev_stats->add_node_stats()->Swap(ns);
break;
}
}
}
@ -360,8 +400,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
void Flush(uint64 start_gpu_ns, uint64 end_gpu_ns,
XPlaneBuilder* device_plane, XPlaneBuilder* host_plane) {
absl::MutexLock lock(&mutex);
mutex_lock l(m);
// Tracking event types per line.
absl::flat_hash_map<int64, absl::flat_hash_set<CuptiTracerEventType>>
events_types_per_line;
@ -449,13 +488,9 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}
}
absl::Mutex mutex;
std::string stream_device GUARDED_BY(mutex);
std::string memcpy_device GUARDED_BY(mutex);
std::string sync_device GUARDED_BY(mutex);
std::vector<CuptiTracerEvent> events GUARDED_BY(mutex);
absl::flat_hash_map<uint32, CorrelationInfo> correlation_info
GUARDED_BY(mutex);
mutex m;
std::vector<CuptiTracerEvent> events GUARDED_BY(m);
absl::flat_hash_map<uint32, CorrelationInfo> correlation_info GUARDED_BY(m);
};
absl::FixedArray<PerDeviceCollector> per_device_collector_;
@ -495,7 +530,6 @@ class GpuTracer : public profiler::ProfilerInterface {
CuptiTracer* cupti_tracer_;
CuptiTracerOptions options_;
StepStats step_stats_;
std::unique_ptr<CuptiTraceCollectorImpl> cupti_collector_;
};
@ -605,12 +639,11 @@ Status GpuTracer::CollectData(RunMetadata* run_metadata) {
return Status::OK();
case State::kStoppedOk: {
// Input run_metadata is shared by profiler interfaces, so we need to append.
StepStatsCollector step_stats_collector(&step_stats_);
StepStats step_stats;
if (cupti_collector_) {
cupti_collector_->Export(&step_stats_collector);
cupti_collector_->Export(&step_stats);
}
step_stats_collector.Finalize();
for (auto& dev_stats : *step_stats_.mutable_dev_stats()) {
for (auto& dev_stats : *step_stats.mutable_dev_stats()) {
run_metadata->mutable_step_stats()->add_dev_stats()->Swap(&dev_stats);
}
return Status::OK();

View File

@ -20,7 +20,6 @@ limitations under the License.
#include <vector>
#include "tensorflow/core/common_runtime/direct_session.h"
#include "tensorflow/core/common_runtime/step_stats_collector.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/graph.pb.h"
#include "tensorflow/core/framework/tensor.h"

View File

@ -58,7 +58,7 @@ class ProfilerInterface {
// Stops profiling.
virtual Status Stop() = 0;
// Saves collected profile data into step_stats_collector.
// Saves collected profile data into run_metadata.
// After this or the overload below is called once, subsequent calls might
// return empty data.
virtual Status CollectData(RunMetadata* run_metadata) = 0;

View File

@ -132,7 +132,10 @@ void StatSummarizer::ProcessStepStats(const StepStats& step_stats) {
int64 mem_total = 0;
int64 first_node_start_us =
step_stats.dev_stats(0).node_stats(0).all_start_micros();
(step_stats.dev_stats_size() > 0 &&
step_stats.dev_stats(0).node_stats_size() > 0)
? step_stats.dev_stats(0).node_stats(0).all_start_micros()
: 0;
int node_num = 0;
for (const auto& ds : step_stats.dev_stats()) {