Surface num dropped events as a warning message.

PiperOrigin-RevId: 316138752
Change-Id: I3ea9d2c17df0dfd40c23a32ce5b1b02a80c0a9ff
This commit is contained in:
A. Unique TensorFlower 2020-06-12 11:12:38 -07:00 committed by TensorFlower Gardener
parent 76fa5e8a4a
commit 2dac8cf550
11 changed files with 73 additions and 21 deletions

View File

@ -228,6 +228,7 @@ cc_library(
":xplane_to_step_events",
":xplane_to_tf_functions",
"//tensorflow/core:lib",
"//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
"//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
"//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc",
"//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
@ -261,6 +262,7 @@ tf_cc_test(
"//tensorflow/core:test",
"//tensorflow/core:test_main",
"//tensorflow/core:testlib",
"//tensorflow/core/profiler/protobuf:diagnostics_proto_cc",
"//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
"//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
"//tensorflow/core/profiler/protobuf:steps_db_proto_cc",

View File

@ -26,6 +26,7 @@ limitations under the License.
#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h"
#include "tensorflow/core/profiler/convert/xplane_to_step_events.h"
#include "tensorflow/core/profiler/convert/xplane_to_tf_functions.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
@ -114,11 +115,20 @@ void ProcessHostPlane(const XPlane* host_plane, bool use_device_step_events,
} // namespace
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats) {
if (space.errors().empty()) return;
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
OpStats* op_stats) {
if (!space.errors().empty()) {
absl::flat_hash_set<std::string> unique_errors;
unique_errors.insert(space.errors().begin(), space.errors().end());
*op_stats->mutable_errors() = {unique_errors.begin(), unique_errors.end()};
*op_stats->mutable_diagnostics()->mutable_errors() = {unique_errors.begin(),
unique_errors.end()};
}
if (!space.warnings().empty()) {
absl::flat_hash_set<std::string> unique_warnings;
unique_warnings.insert(space.warnings().begin(), space.warnings().end());
*op_stats->mutable_diagnostics()->mutable_warnings() = {
unique_warnings.begin(), unique_warnings.end()};
}
}
OpStats ConvertXSpaceToOpStats(const XSpace& space) {
@ -127,7 +137,7 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space) {
FindPlanesWithPrefix(space, kGpuPlanePrefix);
OpStats op_stats;
StepEvents step_events;
PropagateXSpaceErrorsToOpStats(space, &op_stats);
PropagateXSpaceDiagnosticsToOpStats(space, &op_stats);
// Convert device planes.
OpMetricsDbCombiner op_metrics_db_combiner(
op_stats.mutable_device_op_metrics_db());

View File

@ -25,8 +25,9 @@ namespace profiler {
// NOTE: call GroupTfEvents before if OpStats.step_db needs to be generated.
OpStats ConvertXSpaceToOpStats(const XSpace& space);
// Propagate and dedup the errors in XSpace and add to OpStats.
void PropagateXSpaceErrorsToOpStats(const XSpace& space, OpStats* op_stats);
// Propagate and dedup the diagnostics in XSpace and add to OpStats.
void PropagateXSpaceDiagnosticsToOpStats(const XSpace& space,
OpStats* op_stats);
} // namespace profiler
} // namespace tensorflow

View File

@ -18,6 +18,7 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "tensorflow/core/platform/test.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/protobuf/diagnostics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
@ -200,8 +201,8 @@ TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) {
OpStats op_stats = ConvertXSpaceToOpStats(space);
EXPECT_EQ(1, op_stats.errors_size());
EXPECT_EQ(kError, op_stats.errors(/*index=*/0));
EXPECT_EQ(1, op_stats.diagnostics().errors_size());
EXPECT_EQ(kError, op_stats.diagnostics().errors(/*index=*/0));
}
} // namespace

View File

@ -47,6 +47,7 @@ tf_cuda_library(
"//tensorflow/core/profiler/utils:xplane_schema",
"//tensorflow/core/profiler/utils:xplane_utils",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/synchronization",
],
alwayslink = 1,
)

View File

@ -1668,7 +1668,7 @@ Status CuptiTracer::ProcessActivityBuffer(CUcontext context, uint32_t stream_id,
if (dropped != 0) {
uint32 device_id = -1;
RETURN_IF_CUPTI_ERROR(cupti_interface_->GetDeviceId(context, &device_id));
collector_->OnEventsDropped("CUpti activity buffer", dropped);
collector_->OnEventsDropped("cupti activity buffer full", dropped);
}
return Status::OK();
}

View File

@ -26,10 +26,12 @@ limitations under the License.
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_join.h"
#include "absl/synchronization/mutex.h"
#include "tensorflow/core/framework/step_stats.pb.h"
#include "tensorflow/core/platform/abi.h"
#include "tensorflow/core/platform/env_time.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/host_info.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
@ -189,25 +191,28 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
if (event.device_id >= num_gpus_) return;
if (event.source == CuptiTracerEventSource::DriverCallback) {
if (num_callback_events_ > options_.max_callback_api_events) {
OnEventsDropped("trace collector", 1);
OnEventsDropped("total driver(callback) events reaches max", 1);
return;
}
num_callback_events_++;
} else {
if (num_activity_events_ > options_.max_activity_api_events) {
OnEventsDropped("trace collector", 1);
OnEventsDropped("total device(activity) events reaches max", 1);
return;
}
num_activity_events_++;
}
per_device_collector_[event.device_id].AddEvent(std::move(event));
}
void OnEventsDropped(const std::string& reason, uint32 num_events) override {}
// Adds `num_events` to the running total of events dropped for `reason`.
// The per-reason totals live in dropped_events_, which is guarded by mutex_.
void OnEventsDropped(const std::string& reason, uint32 num_events) override {
  absl::MutexLock guard(&mutex_);
  // operator[] value-initializes the counter the first time a reason is seen.
  auto& dropped_for_reason = dropped_events_[reason];
  dropped_for_reason += num_events;
}
void Flush() override {}
void Export(StepStats* step_stats) {
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
<< " callback api events and " << num_activity_events_
<< " activity events.";
<< " activity events. " << ReportDroppedEvents();
for (int i = 0; i < num_gpus_; ++i) {
per_device_collector_[i].Flush(i, start_walltime_ns_, start_gpu_ns_,
step_stats);
@ -216,7 +221,7 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
void Export(XSpace* space) {
LOG(INFO) << " GpuTracer has collected " << num_callback_events_
<< " callback api events and " << num_activity_events_
<< " activity events.";
<< " activity events. " << ReportDroppedEvents();
uint64 end_gpu_ns = CuptiTracer::GetTimestamp();
XPlaneBuilder host_plane(GetOrCreatePlane(space, kCuptiDriverApiPlaneName));
host_plane.SetId(kCuptiDriverApiPlaneId);
@ -232,10 +237,32 @@ class CuptiTraceCollectorImpl : public CuptiTraceCollector {
}
NormalizeTimeStamps(&host_plane, start_walltime_ns_);
}
// Builds a human-readable summary of the dropped events recorded so far.
// Each drop reason contributes one clause of the form
// " <count> events dropped because <reason>;", and the trailing ';' of the
// last clause is replaced by '.'. Returns an empty string when no drops have
// been recorded. Holds mutex_ while reading dropped_events_; clauses appear in
// the map's iteration order.
std::string ReportDroppedEvents() {
  absl::MutexLock lock(&mutex_);
  std::string result;
  // Bind by const reference: iterating by value would copy every
  // (reason, count) pair, including its std::string key.
  for (const auto& dropped : dropped_events_) {
    absl::StrAppend(&result, " ", dropped.second, " events dropped because ",
                    dropped.first, ";");
  }
  // Terminate the sentence by turning the final separator into a period.
  if (!result.empty()) result.back() = '.';
  return result;
}
// Produces the warning text used when GPU events were dropped: the host name,
// the number of driver (callback) and device (activity) events collected, and
// the per-reason summary from ReportDroppedEvents(). Returns an empty string
// when ReportDroppedEvents() reports nothing.
std::string ReportNumEventsIfDropped() {
  const std::string dropped_summary = ReportDroppedEvents();
  if (!dropped_summary.empty()) {
    return absl::StrCat("Detected GPU events dropped on ", port::Hostname(),
                        ": Profiler has collected ",
                        num_callback_events_.load(), " driver events and ",
                        num_activity_events_.load(), " device events.",
                        dropped_summary);
  }
  return "";
}
private:
std::atomic<int> num_callback_events_;
std::atomic<int> num_activity_events_;
absl::Mutex mutex_;
absl::flat_hash_map<std::string, uint64> dropped_events_
ABSL_GUARDED_BY(mutex_);
uint64 start_walltime_ns_;
uint64 start_gpu_ns_;
int num_gpus_;
@ -669,7 +696,11 @@ Status GpuTracer::CollectData(XSpace* space) {
case State::kStoppedOk: {
std::string cupti_error = CuptiTracer::ErrorIfAny();
if (!cupti_error.empty()) {
space->add_errors(cupti_error);
space->add_errors(std::move(cupti_error));
}
// Only query the collector when it exists: the export step that follows
// already treats cupti_collector_ as possibly null, so dereferencing it
// unconditionally here would be inconsistent with that guard.
if (cupti_collector_) {
  std::string events_dropped = cupti_collector_->ReportNumEventsIfDropped();
  if (!events_dropped.empty()) {
    // Dropped events are surfaced as a warning rather than an error.
    space->add_warnings(std::move(events_dropped));
  }
}
if (cupti_collector_) {
cupti_collector_->Export(space);

View File

@ -87,6 +87,7 @@ tf_proto_library(
srcs = ["op_stats.proto"],
cc_api_version = 2,
protodeps = [
":diagnostics_proto",
":kernel_stats_proto",
":op_metrics_proto",
":steps_db_proto",

View File

@ -2,6 +2,7 @@ syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/kernel_stats.proto";
import "tensorflow/core/profiler/protobuf/op_metrics.proto";
import "tensorflow/core/profiler/protobuf/steps_db.proto";
@ -107,6 +108,7 @@ message OpStats {
KernelStatsDb kernel_stats_db = 6;
// Statistics for all tf-functions.
TfFunctionDb tf_function_db = 8;
// Errors seen.
repeated string errors = 7;
// Error and warning messages for diagnosing profiling issues.
Diagnostics diagnostics = 9;
reserved 7;
}

View File

@ -5,11 +5,13 @@ package tensorflow.profiler;
option cc_enable_arenas = true;
// A container of parallel XPlanes, generated by one or more profiling sources.
// Next ID: 3
// Next ID: 4
message XSpace {
repeated XPlane planes = 1;
// Errors (if any) in the generation of planes.
repeated string errors = 2;
// Warnings (if any) in the generation of planes.
repeated string warnings = 3;
}
// An XPlane is a container of parallel timelines (XLines), generated by a

View File

@ -49,7 +49,7 @@ void PopulateStepDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
}
void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
*diag->mutable_errors() = op_stats.errors();
*diag->mutable_errors() = op_stats.diagnostics().errors();
absl::c_sort(*diag->mutable_errors());
if (diag->errors().empty()) {
// Shows run-environment error only if there is no other existing error.
@ -58,6 +58,7 @@ void PopulateOverviewDiagnostics(const OpStats& op_stats, Diagnostics* diag) {
*diag->add_errors() = std::string(kNoDeviceTraceCollected);
}
}
*diag->mutable_warnings() = op_stats.diagnostics().warnings();
PopulateStepDiagnostics(op_stats, diag);
}