Surface libcupti errors to OSS overview page.
PiperOrigin-RevId: 313274858 Change-Id: Ib65176246a378e0fbb8c43ec3eb369555dd43189
This commit is contained in:
parent
676a68963e
commit
c3ded069ab
@ -242,6 +242,7 @@ cc_library(
|
|||||||
"//tensorflow/core/profiler/utils:xplane_utils",
|
"//tensorflow/core/profiler/utils:xplane_utils",
|
||||||
"//tensorflow/core/profiler/utils:xplane_visitor",
|
"//tensorflow/core/profiler/utils:xplane_visitor",
|
||||||
"@com_google_absl//absl/container:flat_hash_map",
|
"@com_google_absl//absl/container:flat_hash_map",
|
||||||
|
"@com_google_absl//absl/container:flat_hash_set",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -294,6 +294,7 @@ OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats,
|
|||||||
bottleneck.input_classification(), bottleneck.input_statement(), "",
|
bottleneck.input_classification(), bottleneck.input_statement(), "",
|
||||||
hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()),
|
hardware_type, TfFunctionRecommendationHtml(op_stats.tf_function_db()),
|
||||||
overview_page.mutable_recommendation());
|
overview_page.mutable_recommendation());
|
||||||
|
*overview_page.mutable_errors() = op_stats.errors();
|
||||||
return overview_page;
|
return overview_page;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
|||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
#include "absl/container/flat_hash_map.h"
|
#include "absl/container/flat_hash_map.h"
|
||||||
|
#include "absl/container/flat_hash_set.h"
|
||||||
#include "tensorflow/core/platform/types.h"
|
#include "tensorflow/core/platform/types.h"
|
||||||
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
|
#include "tensorflow/core/profiler/convert/op_metrics_db_combiner.h"
|
||||||
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
|
#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h"
|
||||||
@ -109,12 +110,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
|
|||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) {
|
||||||
|
if (space.errors().empty()) return;
|
||||||
|
absl::flat_hash_set<std::string> unique_errors;
|
||||||
|
unique_errors.insert(space.errors().begin(), space.errors().end());
|
||||||
|
*op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()};
|
||||||
|
}
|
||||||
|
|
||||||
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
|
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
|
||||||
const XPlane* host_plane = FindPlaneWithName(space, kHostThreads);
|
const XPlane* host_plane = FindPlaneWithName(space, kHostThreads);
|
||||||
std::vector<const XPlane*> device_planes =
|
std::vector<const XPlane*> device_planes =
|
||||||
FindPlanesWithPrefix(space, kGpuPlanePrefix);
|
FindPlanesWithPrefix(space, kGpuPlanePrefix);
|
||||||
OpStats op_stats;
|
OpStats op_stats;
|
||||||
StepEvents step_events;
|
StepEvents step_events;
|
||||||
|
PropagateXSpaceErrorsToOpStats(space, &op_stats);
|
||||||
// Convert device planes.
|
// Convert device planes.
|
||||||
OpMetricsDbCombiner op_metrics_db_combiner(
|
OpMetricsDbCombiner op_metrics_db_combiner(
|
||||||
op_stats.mutable_device_op_metrics_db());
|
op_stats.mutable_device_op_metrics_db());
|
||||||
|
@ -25,6 +25,9 @@ namespace profiler {
|
|||||||
// NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated.
|
// NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated.
|
||||||
OpStats ConvertXSpaceToOpStats(const XSpace& space);
|
OpStats ConvertXSpaceToOpStats(const XSpace& space);
|
||||||
|
|
||||||
|
// Deduplicates the errors in XSpace and stores them in OpStats, overwriting
// any errors already there. Does nothing if XSpace has no errors.
|
||||||
|
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats);
|
||||||
|
|
||||||
} // namespace profiler
|
} // namespace profiler
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
|
||||||
|
@ -185,6 +185,18 @@ TEST(ConcertXPlaneToOpStats, TfFunctionTest) {
|
|||||||
EXPECT_EQ(not_traced_mode.self_time_ps(), 20);
|
EXPECT_EQ(not_traced_mode.self_time_ps(), 20);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that converting an XSpace holding the same error twice yields an
// OpStats with that error exactly once.
TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) {
  XSpace space;
  static constexpr char kError[] = "host: error";
  // Record the identical error twice so deduplication has work to do.
  *space.add_errors() = kError;
  *space.add_errors() = kError;

  OpStats op_stats = ConvertXSpaceToOpStats(space);

  // ASSERT (not EXPECT): the indexed access below must not run when the
  // errors field does not contain exactly one element.
  ASSERT_EQ(1, op_stats.errors_size());
  EXPECT_EQ(kError, op_stats.errors(/*index=*/0));
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
} // namespace profiler
|
} // namespace profiler
|
||||||
} // namespace tensorflow
|
} // namespace tensorflow
|
||||||
|
@ -20,6 +20,7 @@ limitations under the License.
|
|||||||
#include "absl/container/node_hash_map.h"
|
#include "absl/container/node_hash_map.h"
|
||||||
#include "tensorflow/core/platform/env.h"
|
#include "tensorflow/core/platform/env.h"
|
||||||
#include "tensorflow/core/platform/errors.h"
|
#include "tensorflow/core/platform/errors.h"
|
||||||
|
#include "tensorflow/core/platform/host_info.h"
|
||||||
#include "tensorflow/core/platform/logging.h"
|
#include "tensorflow/core/platform/logging.h"
|
||||||
#include "tensorflow/core/platform/macros.h"
|
#include "tensorflow/core/platform/macros.h"
|
||||||
#include "tensorflow/core/platform/mem.h"
|
#include "tensorflow/core/platform/mem.h"
|
||||||
@ -1264,6 +1265,11 @@ class CuptiDriverApiHookWithCudaEvent : public CuptiDriverApiHook {
|
|||||||
std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
|
std::vector<std::unique_ptr<CudaEventRecorder>> cuda_event_recorders_;
|
||||||
TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
|
TF_DISALLOW_COPY_AND_ASSIGN(CuptiDriverApiHookWithCudaEvent);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Returns `error_message` prefixed with the value of port::Hostname() and
// ": ", so the message identifies where it was produced.
std::string ErrorWithHostname(absl::string_view error_message) {
  const std::string hostname = port::Hostname();
  return absl::StrCat(hostname, ": ", error_message);
}
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
/*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
|
/*static*/ Status CuptiDriverApiHook::AddDriverApiCallbackEvent(
|
||||||
@ -1669,11 +1675,13 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
|
|||||||
|
|
||||||
// Returns a description of the first problem found, checked in this order:
// no GPU (NumGpus() == 0), the tracer singleton reports NeedRootAccess(), or
// GetTimestamp() == 0 (reported as a libcupti load failure). Returns an empty
// string when none of these conditions holds.
// This span is a side-by-side diff: the old side returns the bare message,
// the new side wraps each message with ErrorWithHostname().
/*static*/ std::string CuptiTracer::ErrorIfAny() {
|
/*static*/ std::string CuptiTracer::ErrorIfAny() {
|
||||||
if (CuptiTracer::NumGpus() == 0) {
|
if (CuptiTracer::NumGpus() == 0) {
|
||||||
return "No GPU detected.";
|
return ErrorWithHostname("No GPU detected.");
|
||||||
} else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
|
} else if (CuptiTracer::GetCuptiTracerSingleton()->NeedRootAccess()) {
|
||||||
return "Insufficient privilege to run libcupti (you need root permission).";
|
return ErrorWithHostname(
|
||||||
|
"Insufficient privilege to run libcupti (you need root permission).");
|
||||||
} else if (CuptiTracer::GetTimestamp() == 0) {
|
} else if (CuptiTracer::GetTimestamp() == 0) {
|
||||||
return "Failed to load libcupti (is it installed and accessible?)";
|
return ErrorWithHostname(
|
||||||
|
"Failed to load libcupti (is it installed and accessible?)");
|
||||||
}
|
}
|
||||||
return "";
|
return "";
|
||||||
}
|
||||||
|
@ -659,12 +659,16 @@ Status GpuTracer::CollectData(XSpace* space) {
|
|||||||
case State::kStartedOk:
|
case State::kStartedOk:
|
||||||
return errors::FailedPrecondition("Cannot collect trace before stopping");
|
return errors::FailedPrecondition("Cannot collect trace before stopping");
|
||||||
case State::kStartedError:
|
case State::kStartedError:
|
||||||
LOG(ERROR) << "Cannot collect, xprof failed to start";
|
LOG(ERROR) << "Cannot collect, profiler failed to start";
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
case State::kStoppedError:
|
case State::kStoppedError:
|
||||||
VLOG(1) << "No trace data collected";
|
VLOG(1) << "No trace data collected";
|
||||||
return Status::OK();
|
return Status::OK();
|
||||||
case State::kStoppedOk: {
|
case State::kStoppedOk: {
|
||||||
|
std::string cupti_error = CuptiTracer::ErrorIfAny();
|
||||||
|
if (!cupti_error.empty()) {
|
||||||
|
space->add_errors(cupti_error);
|
||||||
|
}
|
||||||
if (cupti_collector_) {
|
if (cupti_collector_) {
|
||||||
cupti_collector_->Export(space);
|
cupti_collector_->Export(space);
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user