Surface libcupti errors to OSS overview page.

PiperOrigin-RevId: 313274858
Change-Id: Ib65176246a378e0fbb8c43ec3eb369555dd43189
This commit is contained in:
A. Unique TensorFlower 2020-05-26 15:04:43 -07:00 committed by TensorFlower Gardener
parent 676a68963e
commit c3ded069ab
7 changed files with 42 additions and 4 deletions

View File

@ -242,6 +242,7 @@ cc_library(
"//tensorflow/core/profiler/utils:xplane_utils",
"//tensorflow/core/profiler/utils:xplane_visitor",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
],
)

View File

@ -294,6 +294,7 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats,
bottleneck.input_classification(), bottleneck.input_statement(), "",
hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()),
overview_page.mutable_recommendation());
*overview_page.mutable_errors() = op_stats.errors();
return overview_page;
}

View File

@ -18,6 +18,7 @@ limitations under the License.
#include <vector>
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
@ -109,12 +110,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
} // namespace
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) {
if (space.errors().empty()) return;
absl::flat_hash_set<std::string> unique_errors;
unique_errors.insert(space.errors().begin(), space.errors().end());
*op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()};
}
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
const XPlane* host_plane = FindPlaneWithName(space, kHostThreads);
std::vector<const XPlane*> device_planes =
FindPlanesWithPrefix(space, kGpuPlanePrefix);
OpStats op_stats;
StepEvents step_events;
PropagateXSpaceErrorsToOpStats(space, &op_stats);
// Convert device planes.
OpMetricsDbCombiner op_metrics_db_combiner(
op_stats.mutable_device_op_metrics_db());

View File

@ -25,6 +25,9 @@ namespace profiler {
// NOTE: call GroupTfEvents first if OpStats.step_db needs to be generated.
OpStats ConvertXSpaceToOpStats(const XSpace& space);
// Propagates the errors recorded in XSpace to OpStats, removing duplicates.
// Does nothing if XSpace has no errors; otherwise the errors already stored in
// OpStats are overwritten.
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats);
} // namespace profiler
} // namespace tensorflow

View File

@ -185,6 +185,18 @@ TEST(ConcertXPlaneToOpStats, TfFunctionTest) {
EXPECT_EQ(not_traced_mode.self_time_ps(), 20);
}
// Verifies that ConvertXSpaceToOpStats copies the XSpace errors into OpStats
// and collapses identical messages into a single entry.
TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) {
  XSpace space;
  static constexpr char kError[] = "host: error";
  // Record the same message twice so that deduplication is exercised.
  *space.add_errors() = kError;
  *space.add_errors() = kError;

  OpStats op_stats = ConvertXSpaceToOpStats(space);

  // ASSERT (not EXPECT): the indexed access below is only valid when exactly
  // one error is present, so stop here instead of indexing out of range.
  ASSERT_EQ(1, op_stats.errors_size());
  EXPECT_EQ(kError, op_stats.errors(/*index=*/0));
}
} // namespace
} // namespace profiler
} // namespace tensorflow

View File

@ -20,6 +20,7 @@ limitations under the License.
#include "absl/container/node_hash_map.h"
#include "tensorflow/core/platform/env.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mem.h"
@ -1264,6 +1265,11 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
};
// Returns `error_message` prefixed with "<hostname>: ", where the hostname is
// whatever port::Hostname() reports.
std::string ErrorWithHostname(absl::string_view error_message) {
  const std::string hostname = port::Hostname();
  return absl::StrCat(hostname, ": ", error_message);
}
} // namespace
/*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
@ -1669,11 +1675,13 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
// Returns a hostname-prefixed message describing the first failed check
// below, or an empty string when none of them fails. Checks run in this
// order: no GPU detected, root access needed, zero CUPTI timestamp.
/*static*/ std::string CuptiTracer::ErrorIfAny() {
  if (CuptiTracer::NumGpus() == 0) {
    return ErrorWithHostname("No GPU detected.");
  }
  if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
    return ErrorWithHostname(
        "Insufficient privilege to run libcupti (you need root permission).");
  }
  // A zero timestamp is reported to the user as a libcupti load failure.
  if (CuptiTracer::GetTimestamp() == 0) {
    return ErrorWithHostname(
        "Failed to load libcupti (is it installed and accessible?)");
  }
  return "";
}

View File

@ -659,12 +659,16 @@ Status GpuTracer::CollectData(XSpace* space) {
case State::kStartedOk:
return errors::FailedPrecondition("Cannot collect trace before stopping");
case State::kStartedError:
LOG(ERROR) << "Cannot collect, xprof failed to start";
LOG(ERROR) << "Cannot collect, profiler failed to start";
return Status::OK();
case State::kStoppedError:
VLOG(1) << "No trace data collected";
return Status::OK();
case State::kStoppedOk: {
std::string cupti_error = CuptiTracer::ErrorIfAny();
if (!cupti_error.empty()) {
space->add_errors(cupti_error);
}
if (cupti_collector_) {
cupti_collector_->Export(space);
}