[Profiler] Separate device collective communication time in the GPU step-time breakdown.
PiperOrigin-RevId: 332884401 Change-Id: Ia97fb9887d29f5faa2205ee97cfba67de87411b0
This commit is contained in:
parent
66fe41900b
commit
0451edab7d
@ -59,6 +59,7 @@ const double kNumPsPerMs = 1000000000.0;
|
||||
// input-bound; else if it is considered HIGHLY input-bound.
|
||||
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
|
||||
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
|
||||
|
||||
// If the percentage of step time that is due to outfeed is less than
|
||||
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
|
||||
// output-bound; else if it is less than
|
||||
@ -66,6 +67,7 @@ constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
|
||||
// output-bound; else if it is considered HIGHLY output-bound.
|
||||
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
|
||||
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
|
||||
|
||||
// If the percentage of step time that is due to kernel launch is less than
|
||||
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
|
||||
// kernel-launch bound; else if it is less than
|
||||
@ -73,6 +75,7 @@ constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
|
||||
// kernel-launch bound; else if it is considered HIGHLY kernel-launch bound.
|
||||
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
|
||||
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
|
||||
|
||||
// If the percentage of step time that is due to all other time is less than
|
||||
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
|
||||
// all-other bound; else if it is less than
|
||||
@ -80,6 +83,16 @@ constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
|
||||
// all-other bound; else if it is considered HIGHLY all-other bound.
|
||||
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
|
||||
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
|
||||
|
||||
// If the percentage of step time that is due to device collectives is less than
|
||||
// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
|
||||
// device-collectives bound; else if it is less than
|
||||
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
|
||||
// device-collectives bound; else if it is considered HIGHLY device-collectives
|
||||
// bound.
|
||||
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
|
||||
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
|
||||
|
||||
// Section number of the host-analysis section in the input-pipeline analysis.
|
||||
constexpr int kHostAnalysisSectionNumber = 3;
|
||||
// Python-only explanation for "All Others" time.
|
||||
@ -125,6 +138,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
||||
Stat<double> output_ms;
|
||||
Stat<double> device_compute_ms;
|
||||
Stat<double> device_to_device_ms;
|
||||
Stat<double> device_collectives_ms;
|
||||
Stat<double> host_compute_ms;
|
||||
Stat<double> host_prepare_ms;
|
||||
Stat<double> host_compile_ms;
|
||||
@ -146,6 +160,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
||||
output_ms.UpdateStat(details.output_ms());
|
||||
device_compute_ms.UpdateStat(details.device_compute_ms());
|
||||
device_to_device_ms.UpdateStat(details.device_to_device_ms());
|
||||
device_collectives_ms.UpdateStat(details.device_collectives_ms());
|
||||
host_compute_ms.UpdateStat(details.host_compute_ms());
|
||||
host_prepare_ms.UpdateStat(details.host_prepare_ms());
|
||||
host_compile_ms.UpdateStat(details.host_compile_ms());
|
||||
@ -162,6 +177,8 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
||||
GetStepSummaryForSampleStats(device_compute_ms);
|
||||
*result.mutable_device_to_device_ms_summary() =
|
||||
GetStepSummaryForSampleStats(device_to_device_ms);
|
||||
*result.mutable_device_collectives_ms_summary() =
|
||||
GetStepSummaryForSampleStats(device_collectives_ms);
|
||||
*result.mutable_host_compute_ms_summary() =
|
||||
GetStepSummaryForSampleStats(host_compute_ms);
|
||||
*result.mutable_host_prepare_ms_summary() =
|
||||
@ -208,14 +225,13 @@ InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
|
||||
GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
|
||||
details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
|
||||
details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
|
||||
GetTimeInMs(type_ps, DEVICE_COMPUTE_32) +
|
||||
GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
|
||||
GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
|
||||
details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
|
||||
GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
|
||||
details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
|
||||
details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
|
||||
details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
|
||||
details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
|
||||
|
||||
result.add_step_details()->PackFrom(details);
|
||||
|
||||
const double input_percent_of_step_time =
|
||||
@ -360,6 +376,32 @@ double RatioOfHostToDeviceTimeToStepTime(
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
void DeviceCollectivesAnalysis(double device_collectives_percent,
|
||||
std::string* device_collectives_classification,
|
||||
std::string* device_collectives_statement) {
|
||||
std::string percent_str =
|
||||
absl::StrFormat("%.1lf", device_collectives_percent);
|
||||
|
||||
if (device_collectives_percent >=
|
||||
kHighlyDeviceCollectivesBoundThresholdInPercent) {
|
||||
*device_collectives_classification = "high";
|
||||
*device_collectives_statement =
|
||||
absl::StrCat(percent_str,
|
||||
" % of the total step time sampled is spent on 'Device "
|
||||
"Collective Communication'.");
|
||||
} else if (device_collectives_percent >=
|
||||
kModeratelyDeviceCollectivesBoundThresholdInPercent) {
|
||||
*device_collectives_classification = "moderate";
|
||||
*device_collectives_statement =
|
||||
absl::StrCat(percent_str,
|
||||
" % of the total step time sampled is spent on 'Device "
|
||||
"Collective Communication'.");
|
||||
} else {
|
||||
*device_collectives_classification = "no";
|
||||
*device_collectives_statement = "";
|
||||
}
|
||||
}
|
||||
|
||||
void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
|
||||
std::string* kernel_launch_classification,
|
||||
std::string* kernel_launch_statement) {
|
||||
@ -660,6 +702,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
||||
double total_host_compile_ms = 0;
|
||||
double total_device_compute_ms = 0;
|
||||
double total_device_to_device_ms = 0;
|
||||
double total_device_collectives_ms = 0;
|
||||
double total_unknown_ms = 0;
|
||||
|
||||
for (const google::protobuf::Any& step_details : any_step_details) {
|
||||
@ -677,6 +720,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
||||
total_host_prepare_ms += details.host_prepare_ms();
|
||||
total_device_compute_ms += details.device_compute_ms();
|
||||
total_device_to_device_ms += details.device_to_device_ms();
|
||||
total_device_collectives_ms += details.device_collectives_ms();
|
||||
total_host_compute_ms += details.host_compute_ms();
|
||||
total_host_compile_ms += details.host_compile_ms();
|
||||
total_unknown_ms += details.unknown_time_ms();
|
||||
@ -692,24 +736,37 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
||||
analysis.set_kernel_launch_statement("");
|
||||
analysis.set_all_other_classification("no");
|
||||
analysis.set_all_other_statement("");
|
||||
analysis.set_device_collectives_classification("no");
|
||||
analysis.set_device_collectives_statement("");
|
||||
return analysis;
|
||||
}
|
||||
double input_percent = 100.0 * total_input_ms / total_step_time_ms;
|
||||
double output_percent = 100.0 * total_output_ms / total_step_time_ms;
|
||||
double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
|
||||
double device_collectives_percent =
|
||||
100.0 * total_device_collectives_ms / total_step_time_ms;
|
||||
|
||||
// idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
|
||||
// host compute, host compile, and unknown.
|
||||
double idle_percent =
|
||||
std::max(0.0, 100.0 - input_percent - output_percent - compute_percent);
|
||||
std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
|
||||
device_collectives_percent);
|
||||
double kernel_launch_percent =
|
||||
100.0 * total_host_prepare_ms / total_step_time_ms;
|
||||
double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
|
||||
|
||||
std::string input_classification;
|
||||
std::string input_statement;
|
||||
bool all_other_reported =
|
||||
InputAnalysis(input_percent, all_other_percent, &input_classification,
|
||||
&input_statement);
|
||||
|
||||
std::string device_collectives_classification;
|
||||
std::string device_collectives_statement;
|
||||
DeviceCollectivesAnalysis(device_collectives_percent,
|
||||
&device_collectives_classification,
|
||||
&device_collectives_statement);
|
||||
|
||||
std::string kernel_launch_classification;
|
||||
std::string kernel_launch_statement;
|
||||
KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
|
||||
@ -732,6 +789,10 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
||||
analysis.set_kernel_launch_statement(kernel_launch_statement);
|
||||
analysis.set_all_other_classification(all_other_classification);
|
||||
analysis.set_all_other_statement(all_other_statement);
|
||||
analysis.set_device_collectives_classification(
|
||||
device_collectives_classification);
|
||||
analysis.set_device_collectives_statement(device_collectives_statement);
|
||||
|
||||
return analysis;
|
||||
}
|
||||
|
||||
|
@ -154,6 +154,10 @@ OverviewPageRecommendation ComputeGenericRecommendation(
|
||||
const PrecisionStats& precision_stats) {
|
||||
OverviewPageRecommendation re;
|
||||
GenericRecommendation generic;
|
||||
generic.set_device_collectives_bottleneck(
|
||||
bottleneck.device_collectives_classification());
|
||||
generic.set_device_collectives_statement(
|
||||
bottleneck.device_collectives_statement());
|
||||
generic.set_kernel_launch_bottleneck(
|
||||
bottleneck.kernel_launch_classification());
|
||||
generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
|
||||
|
@ -30,6 +30,12 @@ message BottleneckAnalysis {
|
||||
string all_other_classification = 5;
|
||||
// A human-readable description of the all other overhead.
|
||||
string all_other_statement = 6;
|
||||
// Indicates if device collective communication is a bottleneck. Possible
|
||||
// values: "no", "moderate", "high".
|
||||
string device_collectives_classification = 11;
|
||||
// A human-readable description of the device collective communication
|
||||
// overhead.
|
||||
string device_collectives_statement = 12;
|
||||
}
|
||||
|
||||
// Used for both step duration and Op duration.
|
||||
@ -60,6 +66,8 @@ message PerGenericStepDetails {
|
||||
double device_compute_ms = 6;
|
||||
// The device-to-device communication time (in ms).
|
||||
double device_to_device_ms = 7;
|
||||
// The device time spent on collective communications (in ms).
|
||||
double device_collectives_ms = 13;
|
||||
// The host-compute time (in ms).
|
||||
double host_compute_ms = 8;
|
||||
// The host-prepare time (in ms).
|
||||
@ -129,6 +137,8 @@ message GenericStepTimeBreakdown {
|
||||
StepSummary device_compute_ms_summary = 4;
|
||||
// Summary of all device-to-device time as a part of step in ms.
|
||||
StepSummary device_to_device_ms_summary = 5;
|
||||
// Summary of all device-collectives time as a part of step in ms.
|
||||
StepSummary device_collectives_ms_summary = 12;
|
||||
// Summary of all host-compute time as a part of step in ms.
|
||||
StepSummary host_compute_ms_summary = 6;
|
||||
// Summary of all host-prepare time as a part of step in ms.
|
||||
|
@ -87,6 +87,12 @@ message GenericRecommendation {
|
||||
// A statement that recommends if the user should try using lower precision.
|
||||
// Shows this statement to users only if it is not empty.
|
||||
string precision_statement = 5;
|
||||
// Indicates if device collectives are a performance bottleneck. Possible
|
||||
// values: "no", "moderate", "high".
|
||||
string device_collectives_bottleneck = 6;
|
||||
// A statement that recommends if we need to further investigate
|
||||
// device-collectives performance.
|
||||
string device_collectives_statement = 7;
|
||||
}
|
||||
|
||||
// Overview result for the recommendation section.
|
||||
|
@ -46,14 +46,17 @@ enum EventType {
|
||||
HOST_TO_DEVICE = 40,
|
||||
// Host is preparing to launch a computation on device.
|
||||
HOST_PREPARE = 50,
|
||||
// Host is waiting for input.
|
||||
HOST_WAIT_INPUT = 60,
|
||||
// Device-to-device communication.
|
||||
DEVICE_TO_DEVICE = 70,
|
||||
// Device-to-host communication.
|
||||
DEVICE_TO_HOST = 80,
|
||||
// Assigns a smaller priority to DEVICE_COLLECTIVES than HOST_WAIT_INPUT,
|
||||
// because if an all-reduce event is overlapped with an host-wait-input event,
|
||||
// we want to count it as waiting for input.
|
||||
// Collective Ops such as All-Reduce.
|
||||
DEVICE_COLLECTIVES = 90,
|
||||
DEVICE_COLLECTIVES = 60,
|
||||
// Host is waiting for input.
|
||||
HOST_WAIT_INPUT = 70,
|
||||
// Device-to-device communication.
|
||||
DEVICE_TO_DEVICE = 80,
|
||||
// Device-to-host communication.
|
||||
DEVICE_TO_HOST = 90,
|
||||
// Device is computing with 32-bit precision.
|
||||
DEVICE_COMPUTE_32 = 100,
|
||||
// Device is computing with 16-bit precision.
|
||||
|
Loading…
Reference in New Issue
Block a user