Surface the number of dropped events as a warning message.
PiperOrigin-RevId: 316138752 Change-Id: I3ea9d2c17df0dfd40c23a32ce5b1b02a80c0a9ff
This commit is contained in:
parent
76fa5e8a4a
commit
2dac8cf550
@ -228,6 +228,7 @@ cc_library(
|
||||
":xplane_to_step_events",
|
||||
":xplane_to_tf_functions",
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
|
||||
@ -261,6 +262,7 @@ tf_cc_test(
|
||||
"//tensorflow/core:test",
|
||||
"//tensorflow/core:test_main",
|
||||
"//tensorflow/core:testlib",
|
||||
"//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:steps_db_proto_cc",
|
||||
|
@ -26,6 +26,7 @@ limitations under the License.
|
||||
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
|
||||
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
|
||||
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
|
||||
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
|
||||
@ -114,11 +115,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
|
||||
|
||||
} // namespace
|
||||
|
||||
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) {
|
||||
if (space.errors().empty()) return;
|
||||
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
|
||||
OpStats* op_stats) {
|
||||
if (!space.errors().empty()) {
|
||||
absl::flat_hash_set<std::string> unique_errors;
|
||||
unique_errors.insert(space.errors().begin(), space.errors().end());
|
||||
*op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()};
|
||||
*op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
|
||||
unique_errors.end()};
|
||||
}
|
||||
if (!space.warnings().empty()) {
|
||||
absl::flat_hash_set<std::string> unique_warnings;
|
||||
unique_warnings.insert(space.warnings().begin(), space.warnings().end());
|
||||
*op_stats->mutable_diagnostics()->mutable_warnings() = {
|
||||
unique_warnings.begin(), unique_warnings.end()};
|
||||
}
|
||||
}
|
||||
|
||||
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
|
||||
@ -127,7 +137,7 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) {
|
||||
FindPlanesWithPrefix(space, kGpuPlanePrefix);
|
||||
OpStats op_stats;
|
||||
StepEvents step_events;
|
||||
PropagateXSpaceErrorsToOpStats(space, &op_stats);
|
||||
PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
|
||||
// Convert device planes.
|
||||
OpMetricsDbCombiner op_metrics_db_combiner(
|
||||
op_stats.mutable_device_op_metrics_db());
|
||||
|
@ -25,8 +25,9 @@ namespace profiler {
|
||||
// NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated.
|
||||
OpStats ConvertXSpaceToOpStats(const XSpace& space);
|
||||
|
||||
// Propagate and dedup the errors in XSpace and add to OpStats.
|
||||
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats);
|
||||
// Propagates the diagnostics (errors and warnings) in XSpace to OpStats,
// removing duplicate messages.
|
||||
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
|
||||
OpStats* op_stats);
|
||||
|
||||
} // namespace profiler
|
||||
} // namespace tensorflow
|
||||
|
@ -18,6 +18,7 @@ limitations under the License.
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "tensorflow/core/platform/test.h"
|
||||
#include "tensorflow/core/platform/types.h"
|
||||
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
|
||||
@ -200,8 +201,8 @@ TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) {
|
||||
|
||||
OpStats op_stats = ConvertXSpaceToOpStats(space);
|
||||
|
||||
EXPECT_EQ(1, op_stats.errors_size());
|
||||
EXPECT_EQ(kError, op_stats.errors(/*index=*/0));
|
||||
EXPECT_EQ(1, op_stats.diagnostics().errors_size());
|
||||
EXPECT_EQ(kError, op_stats.diagnostics().errors(/*index=*/0));
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
@ -47,6 +47,7 @@ tf_cuda_library(
|
||||
"//tensorflow/core/profiler/utils:xplane_schema",
|
||||
"//tensorflow/core/profiler/utils:xplane_utils",
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"@com_google_absl//absl/synchronization",
|
||||
],
|
||||
alwayslink = 1,
|
||||
)
|
||||
|
@ -1668,7 +1668,7 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
|
||||
if (dropped != 0) {
|
||||
uint32 device_id = -1;
|
||||
RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
|
||||
collector_->OnEventsDropped("CUpti activity buffer", dropped);
|
||||
collector_->OnEventsDropped("cupti activity buffer full", dropped);
|
||||
}
|
||||
return Status::OK();
|
||||
}
|
||||
|
@ -26,10 +26,12 @@ limitations under the License.
|
||||
#include "absl/strings/str_cat.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/str_join.h"
|
||||
#include "absl/synchronization/mutex.h"
|
||||
#include "tensorflow/core/framework/step_stats.pb.h"
|
||||
#include "tensorflow/core/platform/abi.h"
|
||||
#include "tensorflow/core/platform/env_time.h"
|
||||
#include "tensorflow/core/platform/errors.h"
|
||||
#include "tensorflow/core/platform/host_info.h"
|
||||
#include "tensorflow/core/platform/macros.h"
|
||||
#include "tensorflow/core/platform/mutex.h"
|
||||
#include "tensorflow/core/platform/thread_annotations.h"
|
||||
@ -189,25 +191,28 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
if (event.device_id >= num_gpus_) return;
|
||||
if (event.source == CuptiTracerEventSource::DriverCallback) {
|
||||
if (num_callback_events_ > options_.max_callback_api_events) {
|
||||
OnEventsDropped("trace collector", 1);
|
||||
OnEventsDropped("total driver(callback) events reaches max", 1);
|
||||
return;
|
||||
}
|
||||
num_callback_events_++;
|
||||
} else {
|
||||
if (num_activity_events_ > options_.max_activity_api_events) {
|
||||
OnEventsDropped("trace collector", 1);
|
||||
OnEventsDropped("total device(activity) events reaches max", 1);
|
||||
return;
|
||||
}
|
||||
num_activity_events_++;
|
||||
}
|
||||
per_device_collector_[event.device_id].AddEvent(std::move(event));
|
||||
}
|
||||
void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
|
||||
void OnEventsDropped(const std::string& reason, uint32 num_events) override {
|
||||
absl::MutexLock lock(&mutex_);
|
||||
dropped_events_[reason] += num_events;
|
||||
}
|
||||
void Flush() override {}
|
||||
void Export(StepStats* step_stats) {
|
||||
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
|
||||
<< " callback api events and " << num_activity_events_
|
||||
<< " activity events.";
|
||||
<< " activity events. " << ReportDroppedEvents();
|
||||
for (int i = 0; i < num_gpus_; ++i) {
|
||||
per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
|
||||
step_stats);
|
||||
@ -216,7 +221,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
void Export(XSpace* space) {
|
||||
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
|
||||
<< " callback api events and " << num_activity_events_
|
||||
<< " activity events.";
|
||||
<< " activity events. " << ReportDroppedEvents();
|
||||
uint64 end_gpu_ns = CuptiTracer::GetTimestamp();
|
||||
XPlaneBuilder host_plane(GetOrCreatePlane(space, kCuptiDriverApiPlaneName));
|
||||
host_plane.SetId(kCuptiDriverApiPlaneId);
|
||||
@ -232,10 +237,32 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
|
||||
}
|
||||
NormalizeTimeStamps(&host_plane, start_walltime_ns_);
|
||||
}
|
||||
std::string ReportDroppedEvents() {
|
||||
absl::MutexLock lock(&mutex_);
|
||||
string result;
|
||||
for (const auto dropped : dropped_events_) {
|
||||
absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
|
||||
dropped.first, ";");
|
||||
}
|
||||
if (!result.empty()) result.back() = '.';
|
||||
return result;
|
||||
}
|
||||
std::string ReportNumEventsIfDropped() {
|
||||
std::string events_dropped = ReportDroppedEvents();
|
||||
if (events_dropped.empty()) return "";
|
||||
return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
|
||||
": Profiler has collected ",
|
||||
num_callback_events_.load(), " driver events and ",
|
||||
num_activity_events_.load(), " device events.",
|
||||
events_dropped);
|
||||
}
|
||||
|
||||
private:
|
||||
std::atomic<int> num_callback_events_;
|
||||
std::atomic<int> num_activity_events_;
|
||||
absl::Mutex mutex_;
|
||||
absl::flat_hash_map<std::string, uint64> dropped_events_
|
||||
ABSL_GUARDED_BY(mutex_);
|
||||
uint64 start_walltime_ns_;
|
||||
uint64 start_gpu_ns_;
|
||||
int num_gpus_;
|
||||
@ -669,7 +696,11 @@ Status GpuTracer::CollectData(XSpace* space) {
|
||||
case State::kStoppedOk: {
|
||||
std::string cupti_error = CuptiTracer::ErrorIfAny();
|
||||
if (!cupti_error.empty()) {
|
||||
space->add_errors(cupti_error);
|
||||
space->add_errors(std::move(cupti_error));
|
||||
}
|
||||
std::string events_dropped = cupti_collector_->ReportNumEventsIfDropped();
|
||||
if (!events_dropped.empty()) {
|
||||
space->add_warnings(std::move(events_dropped));
|
||||
}
|
||||
if (cupti_collector_) {
|
||||
cupti_collector_->Export(space);
|
||||
|
@ -87,6 +87,7 @@ tf_proto_library(
|
||||
srcs = ["op_stats.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = [
|
||||
":diagnostics_proto",
|
||||
":kernel_stats_proto",
|
||||
":op_metrics_proto",
|
||||
":steps_db_proto",
|
||||
|
@ -2,6 +2,7 @@ syntax = "proto3";
|
||||
|
||||
package tensorflow.profiler;
|
||||
|
||||
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
|
||||
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
|
||||
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
|
||||
import "tensorflow/core/profiler/protobuf/steps_db.proto";
|
||||
@ -107,6 +108,7 @@ message OpStats {
|
||||
KernelStatsDb kernel_stats_db = 6;
|
||||
// Statistics for all tf-functions.
|
||||
TfFunctionDb tf_function_db = 8;
|
||||
// Errors seen.
|
||||
repeated string errors = 7;
|
||||
// Error and warning messages for diagnosing profiling issues.
|
||||
Diagnostics diagnostics = 9;
|
||||
reserved 7;
|
||||
}
|
||||
|
@ -5,11 +5,13 @@ package tensorflow.profiler;
|
||||
option cc_enable_arenas = true;
|
||||
|
||||
// A container of parallel XPlanes, generated by one or more profiling sources.
|
||||
// Next ID: 3
|
||||
// Next ID: 4
|
||||
message XSpace {
|
||||
repeated XPlane planes = 1;
|
||||
// Errors (if any) in the generation of planes.
|
||||
repeated string errors = 2;
|
||||
// Warnings (if any) in the generation of planes.
|
||||
repeated string warnings = 3;
|
||||
}
|
||||
|
||||
// An XPlane is a container of parallel timelines (XLines), generated by a
|
||||
|
@ -49,7 +49,7 @@ void PopulateStepDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
|
||||
}
|
||||
|
||||
void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
|
||||
*diag->mutable_errors() = op_stats.errors();
|
||||
*diag->mutable_errors() = op_stats.diagnostics().errors();
|
||||
absl::c_sort(*diag->mutable_errors());
|
||||
if (diag->errors().empty()) {
|
||||
// Shows run-environment error only if there is no other existing error.
|
||||
@ -58,6 +58,7 @@ void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
|
||||
*diag->add_errors() = std::string(kNoDeviceTraceCollected);
|
||||
}
|
||||
}
|
||||
*diag->mutable_warnings() = op_stats.diagnostics().warnings();
|
||||
PopulateStepDiagnostics(op_stats, diag);
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user