From c3ded069abd157c5a311e970d943c6f93c5318d0 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" <gardener@tensorflow.org> Date: Tue, 26 May 2020 15:04:43 -0700 Subject: [PATCH] Surface libcupti errors to OSS overview page. PiperOrigin-RevId: 313274858 Change-Id: Ib65176246a378e0fbb8c43ec3eb369555dd43189 --- tensorflow/core/profiler/convert/BUILD | 1 + .../profiler/convert/op_stats_to_overview_page.cc | 1 + .../core/profiler/convert/xplane_to_op_stats.cc | 9 +++++++++ .../core/profiler/convert/xplane_to_op_stats.h | 3 +++ .../profiler/convert/xplane_to_op_stats_test.cc | 12 ++++++++++++ .../core/profiler/internal/gpu/cupti_tracer.cc | 14 +++++++++++--- .../core/profiler/internal/gpu/device_tracer.cc | 6 +++++- 7 files changed, 42 insertions(+), 4 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 369d26a92d9..390f94157c3 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -242,6 +242,7 @@ cc_library( "//tensorflow/core/profiler/utils:xplane_utils", "//tensorflow/core/profiler/utils:xplane_visitor", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", ], ) diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc index bec92e0d998..330b488dc8f 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -294,6 +294,7 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, bottleneck.input_classification(), bottleneck.input_statement(), "", hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()), overview_page.mutable_recommendation()); + *overview_page.mutable_errors() = op_stats.errors(); return overview_page; } diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index f008219cbd2..4d2a45747e0 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -18,6 +18,7 @@ limitations under the License. #include <vector> #include "absl/container/flat_hash_map.h" +#include "absl/container/flat_hash_set.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h" #include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" @@ -109,12 +110,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events, } // namespace +void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) { + if (space.errors().empty()) return; + absl::flat_hash_set<std::string> unique_errors; + unique_errors.insert(space.errors().begin(), space.errors().end()); + *op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()}; +} + OpStats ConvertXSpaceToOpStats(const XSpace& space) { const XPlane* host_plane = FindPlaneWithName(space, kHostThreads); std::vector<const XPlane*> device_planes = FindPlanesWithPrefix(space, kGpuPlanePrefix); OpStats op_stats; StepEvents step_events; + PropagateXSpaceErrorsToOpStats(space, &op_stats); // Convert device planes. OpMetricsDbCombiner op_metrics_db_combiner( op_stats.mutable_device_op_metrics_db()); diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.h b/tensorflow/core/profiler/convert/xplane_to_op_stats.h index 2d30a5d5fad..4708caa5aae 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.h +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.h @@ -25,6 +25,9 @@ namespace profiler { // NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated. OpStats ConvertXSpaceToOpStats(const XSpace& space); +// Propagate and dedup the errors in XSpace and add to OpStats. +void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index 7b4652f6c0b..67901e83dd3 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -185,6 +185,18 @@ TEST(ConcertXPlaneToOpStats, TfFunctionTest) { EXPECT_EQ(not_traced_mode.self_time_ps(), 20); } +TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) { + XSpace space; + static constexpr char kError[] = "host: error"; + *space.add_errors() = kError; + *space.add_errors() = kError; + + OpStats op_stats = ConvertXSpaceToOpStats(space); + + EXPECT_EQ(1, op_stats.errors_size()); + EXPECT_EQ(kError, op_stats.errors(/*index=*/0)); +} + } // namespace } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc index 51f89bd7b0a..ab16693deae 100644 --- a/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/cupti_tracer.cc @@ -20,6 +20,7 @@ limitations under the License. #include "absl/container/node_hash_map.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/platform/macros.h" #include "tensorflow/core/platform/mem.h" @@ -1264,6 +1265,11 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook { std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_; TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent); }; + +/*static*/ std::string ErrorWithHostname(absl::string_view error_message) { + return absl::StrCat(port::Hostname(), ": ", error_message); +} + } // namespace /*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent( @@ -1669,11 +1675,13 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id, /*static*/ std::string CuptiTracer::ErrorIfAny() { if (CuptiTracer::NumGpus() == 0) { - return "No GPU detected."; + return ErrorWithHostname("No GPU detected."); } else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) { - return "Insufficient privilege to run libcupti (you need root permission)."; + return ErrorWithHostname( + "Insufficient privilege to run libcupti (you need root permission)."); } else if (CuptiTracer::GetTimestamp() == 0) { - return "Failed to load libcupti (is it installed and accessible?)"; + return ErrorWithHostname( + "Failed to load libcupti (is it installed and accessible?)"); } return ""; } diff --git a/tensorflow/core/profiler/internal/gpu/device_tracer.cc b/tensorflow/core/profiler/internal/gpu/device_tracer.cc index ac6662c8432..0370f6a51f9 100644 --- a/tensorflow/core/profiler/internal/gpu/device_tracer.cc +++ b/tensorflow/core/profiler/internal/gpu/device_tracer.cc @@ -659,12 +659,16 @@ Status GpuTracer::CollectData(XSpace* space) { case State::kStartedOk: return errors::FailedPrecondition("Cannot collect trace before stopping"); case State::kStartedError: - LOG(ERROR) << "Cannot collect, xprof failed to start"; + LOG(ERROR) << "Cannot collect, profiler failed to start"; return Status::OK(); case State::kStoppedError: VLOG(1) << "No trace data collected"; return Status::OK(); case State::kStoppedOk: { + std::string cupti_error = CuptiTracer::ErrorIfAny(); + if (!cupti_error.empty()) { + space->add_errors(cupti_error); + } if (cupti_collector_) { cupti_collector_->Export(space); }