diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD
index 5a2fd86f04d..74280f1d126 100644
--- a/tensorflow/core/profiler/convert/BUILD
+++ b/tensorflow/core/profiler/convert/BUILD
@@ -228,6 +228,7 @@ cc_library(
         ":xplane_to_step_events",
         ":xplane_to_tf_functions",
         "//tensorflow/core:lib",
+        "//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
         "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
         "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
         "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
@@ -261,6 +262,7 @@ tf_cc_test(
         "//tensorflow/core:test",
         "//tensorflow/core:test_main",
         "//tensorflow/core:testlib",
+        "//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
         "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
         "//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
         "//tensorflow/core/profiler/protobuf:steps_db_proto_cc",
diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
index ccd7c54fa19..9d3aca9d831 100644
--- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc
@@ -26,6 +26,7 @@ limitations under the License.
 #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
 #include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
 #include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
+#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
 #include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
 #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
 #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
@@ -114,11 +115,21 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
 
 }  // namespace
 
-void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) {
-  if (space.errors().empty()) return;
-  absl::flat_hash_set<std::string> unique_errors;
-  unique_errors.insert(space.errors().begin(), space.errors().end());
-  *op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()};
+void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
+                                         OpStats* op_stats) {
+  // Hash sets drop duplicate messages; their iteration order is unspecified.
+  if (!space.errors().empty()) {
+    absl::flat_hash_set<std::string> unique_errors;
+    unique_errors.insert(space.errors().begin(), space.errors().end());
+    *op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
+                                                          unique_errors.end()};
+  }
+  if (!space.warnings().empty()) {
+    absl::flat_hash_set<std::string> unique_warnings;
+    unique_warnings.insert(space.warnings().begin(), space.warnings().end());
+    *op_stats->mutable_diagnostics()->mutable_warnings() = {
+        unique_warnings.begin(), unique_warnings.end()};
+  }
 }
 
 OpStats ConvertXSpaceToOpStats(const XSpace& space) {
@@ -127,7 +138,7 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) {
       FindPlanesWithPrefix(space, kGpuPlanePrefix);
   OpStats op_stats;
   StepEvents step_events;
-  PropagateXSpaceErrorsToOpStats(space, &op_stats);
+  PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
   // Convert device planes.
   OpMetricsDbCombiner op_metrics_db_combiner(
       op_stats.mutable_device_op_metrics_db());
diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.h b/tensorflow/core/profiler/convert/xplane_to_op_stats.h
index 4708caa5aae..073f7581b01 100644
--- a/tensorflow/core/profiler/convert/xplane_to_op_stats.h
+++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.h
@@ -25,8 +25,10 @@ namespace profiler {
 // NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated.
 OpStats ConvertXSpaceToOpStats(const XSpace& space);
 
-// Propagate and dedup the errors in XSpace and add to OpStats.
-void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats);
+// Propagates the errors and warnings in XSpace to OpStats.diagnostics, removing
+// duplicates. The order of the propagated messages is unspecified.
+void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
+                                         OpStats* op_stats);
 
 }  // namespace profiler
 }  // namespace tensorflow
diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc
index 5c1b6f8a89e..1812a5592bc 100644
--- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc
+++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc
@@ -18,6 +18,7 @@ limitations under the License.
#include "absl/strings/str_cat.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/steps_db.pb.h" @@ -200,8 +201,8 @@ TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) { OpStats op_stats = ConvertXSpaceToOpStats(space); - EXPECT_EQ(1, op_stats.errors_size()); - EXPECT_EQ(kError, op_stats.errors(/*index=*/0)); + EXPECT_EQ(1, op_stats.diagnostics().errors_size()); + EXPECT_EQ(kError, op_stats.diagnostics().errors(/*index=*/0)); } } // namespace diff --git a/tensorflow/core/profiler/internal/gpu/BUILD b/tensorflow/core/profiler/internal/gpu/BUILD index 2bb0805a592..987bc5ea336 100644 --- a/tensorflow/core/profiler/internal/gpu/BUILD +++ b/tensorflow/core/profiler/internal/gpu/BUILD @@ -47,6 +47,7 @@ tf_cuda_library( "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/synchronization", ], alwayslink = 1, ) diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index ab16693deae..734b4ad829e 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -1668,7 +1668,7 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id, if (dropped != 0) { uint32 device_id = -1; RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id)); - collector_->OnEventsDropped("CUpti activity buffer", dropped); + collector_->OnEventsDropped("cupti activity buffer full", dropped); } return Status::OK(); } diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index 
acb52df054b..5ddee687333 100644
--- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc
+++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc
@@ -26,10 +26,12 @@ limitations under the License.
 #include "absl/strings/str_cat.h"
 #include "absl/strings/str_format.h"
 #include "absl/strings/str_join.h"
+#include "absl/synchronization/mutex.h"
 #include "tensorflow/core/framework/step_stats.pb.h"
 #include "tensorflow/core/platform/abi.h"
 #include "tensorflow/core/platform/env_time.h"
 #include "tensorflow/core/platform/errors.h"
+#include "tensorflow/core/platform/host_info.h"
 #include "tensorflow/core/platform/macros.h"
 #include "tensorflow/core/platform/mutex.h"
 #include "tensorflow/core/platform/thread_annotations.h"
@@ -189,25 +191,28 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
     if (event.device_id >= num_gpus_) return;
     if (event.source == CuptiTracerEventSource::DriverCallback) {
       if (num_callback_events_ > options_.max_callback_api_events) {
-        OnEventsDropped("trace collector", 1);
+        OnEventsDropped("total driver(callback) events reaches max", 1);
         return;
       }
       num_callback_events_++;
     } else {
       if (num_activity_events_ > options_.max_activity_api_events) {
-        OnEventsDropped("trace collector", 1);
+        OnEventsDropped("total device(activity) events reaches max", 1);
         return;
       }
       num_activity_events_++;
     }
     per_device_collector_[event.device_id].AddEvent(std::move(event));
   }
-  void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
+  void OnEventsDropped(const std::string& reason, uint32 num_events) override {
+    absl::MutexLock lock(&mutex_);
+    dropped_events_[reason] += num_events;
+  }
   void Flush() override {}
   void Export(StepStats* step_stats) {
     LOG(INFO) << " GpuTracer has collected " << num_callback_events_
               << " callback api events and " << num_activity_events_
-              << " activity events.";
+              << " activity events." << ReportDroppedEvents();
     for (int i = 0; i < num_gpus_; ++i) {
       per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
                                      step_stats);
@@ -216,7 +221,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
   void Export(XSpace* space) {
     LOG(INFO) << " GpuTracer has collected " << num_callback_events_
               << " callback api events and " << num_activity_events_
-              << " activity events.";
+              << " activity events." << ReportDroppedEvents();
     uint64 end_gpu_ns = CuptiTracer::GetTimestamp();
     XPlaneBuilder host_plane(GetOrCreatePlane(space, kCuptiDriverApiPlaneName));
     host_plane.SetId(kCuptiDriverApiPlaneId);
@@ -232,10 +237,36 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
     }
     NormalizeTimeStamps(&host_plane, start_walltime_ns_);
   }
+  // Returns one " <count> events dropped because <reason>;" clause per drop
+  // reason, with the last ';' replaced by '.'; empty if nothing was dropped.
+  std::string ReportDroppedEvents() {
+    absl::MutexLock lock(&mutex_);
+    std::string result;
+    for (const auto& dropped : dropped_events_) {
+      absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
+                      dropped.first, ";");
+    }
+    if (!result.empty()) result.back() = '.';
+    return result;
+  }
+  // Returns a warning naming the host and the collected event counts when any
+  // events were dropped; returns an empty string otherwise.
+  std::string ReportNumEventsIfDropped() {
+    std::string events_dropped = ReportDroppedEvents();
+    if (events_dropped.empty()) return "";
+    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
+                        ": Profiler has collected ",
+                        num_callback_events_.load(), " driver events and ",
+                        num_activity_events_.load(), " device events.",
+                        events_dropped);
+  }
 
  private:
   std::atomic<int> num_callback_events_;
   std::atomic<int> num_activity_events_;
+  absl::Mutex mutex_;
+  absl::flat_hash_map<std::string, uint64> dropped_events_
+      ABSL_GUARDED_BY(mutex_);
   uint64 start_walltime_ns_;
   uint64 start_gpu_ns_;
   int num_gpus_;
@@ -669,7 +700,13 @@ Status GpuTracer::CollectData(XSpace* space) {
     case State::kStoppedOk: {
       std::string cupti_error = CuptiTracer::ErrorIfAny();
       if (!cupti_error.empty()) {
-        space->add_errors(cupti_error);
+        space->add_errors(std::move(cupti_error));
       }
       if (cupti_collector_) {
+        // Query dropped events only under the null check guarding Export.
+        std::string events_dropped =
+            cupti_collector_->ReportNumEventsIfDropped();
+        if (!events_dropped.empty()) {
+          space->add_warnings(std::move(events_dropped));
+        }
         cupti_collector_->Export(space);
diff --git a/tensorflow/core/profiler/protobuf/BUILD b/tensorflow/core/profiler/protobuf/BUILD
index cd84aeb6259..81bc222e119 100644
--- a/tensorflow/core/profiler/protobuf/BUILD
+++ b/tensorflow/core/profiler/protobuf/BUILD
@@ -87,6 +87,7 @@ tf_proto_library(
     srcs = ["op_stats.proto"],
     cc_api_version = 2,
     protodeps = [
+        ":diagnostics_proto",
         ":kernel_stats_proto",
         ":op_metrics_proto",
         ":steps_db_proto",
diff --git a/tensorflow/core/profiler/protobuf/op_stats.proto b/tensorflow/core/profiler/protobuf/op_stats.proto
index aa7cd563a33..4800e88a50a 100644
--- a/tensorflow/core/profiler/protobuf/op_stats.proto
+++ b/tensorflow/core/profiler/protobuf/op_stats.proto
@@ -2,6 +2,7 @@ syntax = "proto3";
 
 package tensorflow.profiler;
 
+import "tensorflow/core/profiler/protobuf/diagnostics.proto";
 import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
 import "tensorflow/core/profiler/protobuf/op_metrics.proto";
 import "tensorflow/core/profiler/protobuf/steps_db.proto";
@@ -107,6 +108,7 @@ message OpStats {
   KernelStatsDb kernel_stats_db = 6;
   // Statistics for all tf-functions.
   TfFunctionDb tf_function_db = 8;
-  // Errors seen.
-  repeated string errors = 7;
+  // Error and warning messages for diagnosing profiling issues.
+  Diagnostics diagnostics = 9;
+  reserved 7;
 }
diff --git a/tensorflow/core/profiler/protobuf/xplane.proto b/tensorflow/core/profiler/protobuf/xplane.proto
index 7b4a7b0e658..48aa38dafff 100644
--- a/tensorflow/core/profiler/protobuf/xplane.proto
+++ b/tensorflow/core/profiler/protobuf/xplane.proto
@@ -5,11 +5,13 @@ package tensorflow.profiler;
 option cc_enable_arenas = true;
 
 // A container of parallel XPlanes, generated by one or more profiling sources.
-// Next ID: 3
+// Next ID: 4
 message XSpace {
   repeated XPlane planes = 1;
   // Errors (if any) in the generation of planes.
   repeated string errors = 2;
+  // Warnings (if any) in the generation of planes.
+  repeated string warnings = 3;
 }
 
 // An XPlane is a container of parallel timelines (XLines), generated by a
diff --git a/tensorflow/core/profiler/utils/diagnostics.cc b/tensorflow/core/profiler/utils/diagnostics.cc
index dc89531b867..c15ff513f9a 100644
--- a/tensorflow/core/profiler/utils/diagnostics.cc
+++ b/tensorflow/core/profiler/utils/diagnostics.cc
@@ -49,7 +49,7 @@ void PopulateStepDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
 }
 
 void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
-  *diag->mutable_errors() = op_stats.diagnostics().errors();
+  *diag->mutable_errors() = op_stats.diagnostics().errors();
   absl::c_sort(*diag->mutable_errors());
   if (diag->errors().empty()) {
     // Shows run-environment error only if there is no other existing error.
@@ -58,6 +58,9 @@ void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
       *diag->add_errors() = std::string(kNoDeviceTraceCollected);
     }
   }
+  *diag->mutable_warnings() = op_stats.diagnostics().warnings();
+  // Sort so the reported order is deterministic, matching the errors.
+  absl::c_sort(*diag->mutable_warnings());
   PopulateStepDiagnostics(op_stats, diag);
 }
 