[Profiler] Separate device collective communication time in the GPU step-time breakdown.
PiperOrigin-RevId: 332884401 Change-Id: Ia97fb9887d29f5faa2205ee97cfba67de87411b0
This commit is contained in:
parent
66fe41900b
commit
0451edab7d
@ -59,6 +59,7 @@ const double kNumPsPerMs = 1000000000.0;
|
|||||||
// input-bound; otherwise it is considered HIGHLY input-bound.
|
// input-bound; otherwise it is considered HIGHLY input-bound.
|
||||||
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
|
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
|
||||||
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
|
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
|
||||||
|
|
||||||
// If the percentage of step time that is due to outfeed is less than
|
// If the percentage of step time that is due to outfeed is less than
|
||||||
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
|
// kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
|
||||||
// output-bound; else if it is less than
|
// output-bound; else if it is less than
|
||||||
@ -66,6 +67,7 @@ constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
|
|||||||
// output-bound; otherwise it is considered HIGHLY output-bound.
|
// output-bound; otherwise it is considered HIGHLY output-bound.
|
||||||
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
|
constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
|
||||||
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
|
constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
|
||||||
|
|
||||||
// If the percentage of step time that is due to kernel launch is less than
|
// If the percentage of step time that is due to kernel launch is less than
|
||||||
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
|
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
|
||||||
// kernel-launch bound; else if it is less than
|
// kernel-launch bound; else if it is less than
|
||||||
@ -73,6 +75,7 @@ constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
|
|||||||
// kernel-launch bound; otherwise it is considered HIGHLY kernel-launch bound.
|
// kernel-launch bound; otherwise it is considered HIGHLY kernel-launch bound.
|
||||||
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
|
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
|
||||||
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
|
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
|
||||||
|
|
||||||
// If the percentage of step time that is due to all other time is less than
|
// If the percentage of step time that is due to all other time is less than
|
||||||
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
|
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
|
||||||
// all-other bound; else if it is less than
|
// all-other bound; else if it is less than
|
||||||
@ -80,6 +83,16 @@ constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
|
|||||||
// all-other bound; otherwise it is considered HIGHLY all-other bound.
|
// all-other bound; otherwise it is considered HIGHLY all-other bound.
|
||||||
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
|
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
|
||||||
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
|
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
|
||||||
|
|
||||||
|
// If the percentage of step time that is due to device collectives is less than
|
||||||
|
// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
|
||||||
|
// device-collectives bound; else if it is less than
|
||||||
|
// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
|
||||||
|
// device-collectives bound; otherwise it is considered HIGHLY
|
||||||
|
// device-collectives bound.
|
||||||
|
constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
|
||||||
|
constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
|
||||||
|
|
||||||
// Section number of the host-analysis section in the input-pipeline analysis.
|
// Section number of the host-analysis section in the input-pipeline analysis.
|
||||||
constexpr int kHostAnalysisSectionNumber = 3;
|
constexpr int kHostAnalysisSectionNumber = 3;
|
||||||
// Python-only explanation for "All Others" time.
|
// Python-only explanation for "All Others" time.
|
||||||
@ -125,6 +138,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
|||||||
Stat<double> output_ms;
|
Stat<double> output_ms;
|
||||||
Stat<double> device_compute_ms;
|
Stat<double> device_compute_ms;
|
||||||
Stat<double> device_to_device_ms;
|
Stat<double> device_to_device_ms;
|
||||||
|
Stat<double> device_collectives_ms;
|
||||||
Stat<double> host_compute_ms;
|
Stat<double> host_compute_ms;
|
||||||
Stat<double> host_prepare_ms;
|
Stat<double> host_prepare_ms;
|
||||||
Stat<double> host_compile_ms;
|
Stat<double> host_compile_ms;
|
||||||
@ -146,6 +160,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
|||||||
output_ms.UpdateStat(details.output_ms());
|
output_ms.UpdateStat(details.output_ms());
|
||||||
device_compute_ms.UpdateStat(details.device_compute_ms());
|
device_compute_ms.UpdateStat(details.device_compute_ms());
|
||||||
device_to_device_ms.UpdateStat(details.device_to_device_ms());
|
device_to_device_ms.UpdateStat(details.device_to_device_ms());
|
||||||
|
device_collectives_ms.UpdateStat(details.device_collectives_ms());
|
||||||
host_compute_ms.UpdateStat(details.host_compute_ms());
|
host_compute_ms.UpdateStat(details.host_compute_ms());
|
||||||
host_prepare_ms.UpdateStat(details.host_prepare_ms());
|
host_prepare_ms.UpdateStat(details.host_prepare_ms());
|
||||||
host_compile_ms.UpdateStat(details.host_compile_ms());
|
host_compile_ms.UpdateStat(details.host_compile_ms());
|
||||||
@ -162,6 +177,8 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
|
|||||||
GetStepSummaryForSampleStats(device_compute_ms);
|
GetStepSummaryForSampleStats(device_compute_ms);
|
||||||
*result.mutable_device_to_device_ms_summary() =
|
*result.mutable_device_to_device_ms_summary() =
|
||||||
GetStepSummaryForSampleStats(device_to_device_ms);
|
GetStepSummaryForSampleStats(device_to_device_ms);
|
||||||
|
*result.mutable_device_collectives_ms_summary() =
|
||||||
|
GetStepSummaryForSampleStats(device_collectives_ms);
|
||||||
*result.mutable_host_compute_ms_summary() =
|
*result.mutable_host_compute_ms_summary() =
|
||||||
GetStepSummaryForSampleStats(host_compute_ms);
|
GetStepSummaryForSampleStats(host_compute_ms);
|
||||||
*result.mutable_host_prepare_ms_summary() =
|
*result.mutable_host_prepare_ms_summary() =
|
||||||
@ -208,14 +225,13 @@ InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
|
|||||||
GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
|
GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
|
||||||
details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
|
details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
|
||||||
details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
|
details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
|
||||||
GetTimeInMs(type_ps, DEVICE_COMPUTE_32) +
|
GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
|
||||||
GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
|
|
||||||
details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
|
details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
|
||||||
GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
|
GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
|
||||||
|
details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
|
||||||
details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
|
details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
|
||||||
details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
|
details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
|
||||||
details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
|
details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
|
||||||
|
|
||||||
result.add_step_details()->PackFrom(details);
|
result.add_step_details()->PackFrom(details);
|
||||||
|
|
||||||
const double input_percent_of_step_time =
|
const double input_percent_of_step_time =
|
||||||
@ -360,6 +376,32 @@ double RatioOfHostToDeviceTimeToStepTime(
|
|||||||
return 0.0;
|
return 0.0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void DeviceCollectivesAnalysis(double device_collectives_percent,
|
||||||
|
std::string* device_collectives_classification,
|
||||||
|
std::string* device_collectives_statement) {
|
||||||
|
std::string percent_str =
|
||||||
|
absl::StrFormat("%.1lf", device_collectives_percent);
|
||||||
|
|
||||||
|
if (device_collectives_percent >=
|
||||||
|
kHighlyDeviceCollectivesBoundThresholdInPercent) {
|
||||||
|
*device_collectives_classification = "high";
|
||||||
|
*device_collectives_statement =
|
||||||
|
absl::StrCat(percent_str,
|
||||||
|
" % of the total step time sampled is spent on 'Device "
|
||||||
|
"Collective Communication'.");
|
||||||
|
} else if (device_collectives_percent >=
|
||||||
|
kModeratelyDeviceCollectivesBoundThresholdInPercent) {
|
||||||
|
*device_collectives_classification = "moderate";
|
||||||
|
*device_collectives_statement =
|
||||||
|
absl::StrCat(percent_str,
|
||||||
|
" % of the total step time sampled is spent on 'Device "
|
||||||
|
"Collective Communication'.");
|
||||||
|
} else {
|
||||||
|
*device_collectives_classification = "no";
|
||||||
|
*device_collectives_statement = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
|
void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
|
||||||
std::string* kernel_launch_classification,
|
std::string* kernel_launch_classification,
|
||||||
std::string* kernel_launch_statement) {
|
std::string* kernel_launch_statement) {
|
||||||
@ -660,6 +702,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
|||||||
double total_host_compile_ms = 0;
|
double total_host_compile_ms = 0;
|
||||||
double total_device_compute_ms = 0;
|
double total_device_compute_ms = 0;
|
||||||
double total_device_to_device_ms = 0;
|
double total_device_to_device_ms = 0;
|
||||||
|
double total_device_collectives_ms = 0;
|
||||||
double total_unknown_ms = 0;
|
double total_unknown_ms = 0;
|
||||||
|
|
||||||
for (const google::protobuf::Any& step_details : any_step_details) {
|
for (const google::protobuf::Any& step_details : any_step_details) {
|
||||||
@ -677,6 +720,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
|||||||
total_host_prepare_ms += details.host_prepare_ms();
|
total_host_prepare_ms += details.host_prepare_ms();
|
||||||
total_device_compute_ms += details.device_compute_ms();
|
total_device_compute_ms += details.device_compute_ms();
|
||||||
total_device_to_device_ms += details.device_to_device_ms();
|
total_device_to_device_ms += details.device_to_device_ms();
|
||||||
|
total_device_collectives_ms += details.device_collectives_ms();
|
||||||
total_host_compute_ms += details.host_compute_ms();
|
total_host_compute_ms += details.host_compute_ms();
|
||||||
total_host_compile_ms += details.host_compile_ms();
|
total_host_compile_ms += details.host_compile_ms();
|
||||||
total_unknown_ms += details.unknown_time_ms();
|
total_unknown_ms += details.unknown_time_ms();
|
||||||
@ -692,24 +736,37 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
|||||||
analysis.set_kernel_launch_statement("");
|
analysis.set_kernel_launch_statement("");
|
||||||
analysis.set_all_other_classification("no");
|
analysis.set_all_other_classification("no");
|
||||||
analysis.set_all_other_statement("");
|
analysis.set_all_other_statement("");
|
||||||
|
analysis.set_device_collectives_classification("no");
|
||||||
|
analysis.set_device_collectives_statement("");
|
||||||
return analysis;
|
return analysis;
|
||||||
}
|
}
|
||||||
double input_percent = 100.0 * total_input_ms / total_step_time_ms;
|
double input_percent = 100.0 * total_input_ms / total_step_time_ms;
|
||||||
double output_percent = 100.0 * total_output_ms / total_step_time_ms;
|
double output_percent = 100.0 * total_output_ms / total_step_time_ms;
|
||||||
double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
|
double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
|
||||||
|
double device_collectives_percent =
|
||||||
|
100.0 * total_device_collectives_ms / total_step_time_ms;
|
||||||
|
|
||||||
// idle_percent includes host_prepare (i.e. kernel launch), device-to-device,
|
// idle_percent includes host_prepare (i.e. kernel launch), device-to-device,
|
||||||
// host compute, host compile, and unknown.
|
// host compute, host compile, and unknown.
|
||||||
double idle_percent =
|
double idle_percent =
|
||||||
std::max(0.0, 100.0 - input_percent - output_percent - compute_percent);
|
std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
|
||||||
|
device_collectives_percent);
|
||||||
double kernel_launch_percent =
|
double kernel_launch_percent =
|
||||||
100.0 * total_host_prepare_ms / total_step_time_ms;
|
100.0 * total_host_prepare_ms / total_step_time_ms;
|
||||||
double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
|
double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
|
||||||
|
|
||||||
std::string input_classification;
|
std::string input_classification;
|
||||||
std::string input_statement;
|
std::string input_statement;
|
||||||
bool all_other_reported =
|
bool all_other_reported =
|
||||||
InputAnalysis(input_percent, all_other_percent, &input_classification,
|
InputAnalysis(input_percent, all_other_percent, &input_classification,
|
||||||
&input_statement);
|
&input_statement);
|
||||||
|
|
||||||
|
std::string device_collectives_classification;
|
||||||
|
std::string device_collectives_statement;
|
||||||
|
DeviceCollectivesAnalysis(device_collectives_percent,
|
||||||
|
&device_collectives_classification,
|
||||||
|
&device_collectives_statement);
|
||||||
|
|
||||||
std::string kernel_launch_classification;
|
std::string kernel_launch_classification;
|
||||||
std::string kernel_launch_statement;
|
std::string kernel_launch_statement;
|
||||||
KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
|
KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
|
||||||
@ -732,6 +789,10 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
|
|||||||
analysis.set_kernel_launch_statement(kernel_launch_statement);
|
analysis.set_kernel_launch_statement(kernel_launch_statement);
|
||||||
analysis.set_all_other_classification(all_other_classification);
|
analysis.set_all_other_classification(all_other_classification);
|
||||||
analysis.set_all_other_statement(all_other_statement);
|
analysis.set_all_other_statement(all_other_statement);
|
||||||
|
analysis.set_device_collectives_classification(
|
||||||
|
device_collectives_classification);
|
||||||
|
analysis.set_device_collectives_statement(device_collectives_statement);
|
||||||
|
|
||||||
return analysis;
|
return analysis;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -154,6 +154,10 @@ OverviewPageRecommendation ComputeGenericRecommendation(
|
|||||||
const PrecisionStats& precision_stats) {
|
const PrecisionStats& precision_stats) {
|
||||||
OverviewPageRecommendation re;
|
OverviewPageRecommendation re;
|
||||||
GenericRecommendation generic;
|
GenericRecommendation generic;
|
||||||
|
generic.set_device_collectives_bottleneck(
|
||||||
|
bottleneck.device_collectives_classification());
|
||||||
|
generic.set_device_collectives_statement(
|
||||||
|
bottleneck.device_collectives_statement());
|
||||||
generic.set_kernel_launch_bottleneck(
|
generic.set_kernel_launch_bottleneck(
|
||||||
bottleneck.kernel_launch_classification());
|
bottleneck.kernel_launch_classification());
|
||||||
generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
|
generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
|
||||||
|
@ -30,6 +30,12 @@ message BottleneckAnalysis {
|
|||||||
string all_other_classification = 5;
|
string all_other_classification = 5;
|
||||||
// A human-readable description of the all other overhead.
|
// A human-readable description of the all other overhead.
|
||||||
string all_other_statement = 6;
|
string all_other_statement = 6;
|
||||||
|
// Indicates if device collective communication is a bottleneck. Possible
|
||||||
|
// values: "no", "moderate", "high".
|
||||||
|
string device_collectives_classification = 11;
|
||||||
|
// A human-readable description of the device collective communication
|
||||||
|
// overhead.
|
||||||
|
string device_collectives_statement = 12;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Used for both step duration and Op duration.
|
// Used for both step duration and Op duration.
|
||||||
@ -60,6 +66,8 @@ message PerGenericStepDetails {
|
|||||||
double device_compute_ms = 6;
|
double device_compute_ms = 6;
|
||||||
// The device-to-device communication time (in ms).
|
// The device-to-device communication time (in ms).
|
||||||
double device_to_device_ms = 7;
|
double device_to_device_ms = 7;
|
||||||
|
// The device time spent on collective communications (in ms).
|
||||||
|
double device_collectives_ms = 13;
|
||||||
// The host-compute time (in ms).
|
// The host-compute time (in ms).
|
||||||
double host_compute_ms = 8;
|
double host_compute_ms = 8;
|
||||||
// The host-prepare time (in ms).
|
// The host-prepare time (in ms).
|
||||||
@ -129,6 +137,8 @@ message GenericStepTimeBreakdown {
|
|||||||
StepSummary device_compute_ms_summary = 4;
|
StepSummary device_compute_ms_summary = 4;
|
||||||
// Summary of all device-to-device time as a part of step in ms.
|
// Summary of all device-to-device time as a part of step in ms.
|
||||||
StepSummary device_to_device_ms_summary = 5;
|
StepSummary device_to_device_ms_summary = 5;
|
||||||
|
// Summary of all device-collectives time as a part of step in ms.
|
||||||
|
StepSummary device_collectives_ms_summary = 12;
|
||||||
// Summary of all host-compute time as a part of step in ms.
|
// Summary of all host-compute time as a part of step in ms.
|
||||||
StepSummary host_compute_ms_summary = 6;
|
StepSummary host_compute_ms_summary = 6;
|
||||||
// Summary of all host-prepare time as a part of step in ms.
|
// Summary of all host-prepare time as a part of step in ms.
|
||||||
|
@ -87,6 +87,12 @@ message GenericRecommendation {
|
|||||||
// A statement that recommends if the user should try using lower precision.
|
// A statement that recommends if the user should try using lower precision.
|
||||||
// Shows this statement to users only if it is not empty.
|
// Shows this statement to users only if it is not empty.
|
||||||
string precision_statement = 5;
|
string precision_statement = 5;
|
||||||
|
// Indicates if device collectives are a performance bottleneck. Possible
|
||||||
|
// values: "no", "moderate", "high".
|
||||||
|
string device_collectives_bottleneck = 6;
|
||||||
|
// A statement that recommends if we need to further investigate
|
||||||
|
// device-collectives performance.
|
||||||
|
string device_collectives_statement = 7;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Overview result for the recommendation section.
|
// Overview result for the recommendation section.
|
||||||
|
@ -46,14 +46,17 @@ enum EventType {
|
|||||||
HOST_TO_DEVICE = 40,
|
HOST_TO_DEVICE = 40,
|
||||||
// Host is preparing to launch a computation on device.
|
// Host is preparing to launch a computation on device.
|
||||||
HOST_PREPARE = 50,
|
HOST_PREPARE = 50,
|
||||||
// Host is waiting for input.
|
// Assigns a smaller priority to DEVICE_COLLECTIVES than HOST_WAIT_INPUT,
|
||||||
HOST_WAIT_INPUT = 60,
|
// because if an all-reduce event is overlapped with a host-wait-input event,
|
||||||
// Device-to-device communication.
|
// we want to count it as waiting for input.
|
||||||
DEVICE_TO_DEVICE = 70,
|
|
||||||
// Device-to-host communication.
|
|
||||||
DEVICE_TO_HOST = 80,
|
|
||||||
// Collective Ops such as All-Reduce.
|
// Collective Ops such as All-Reduce.
|
||||||
DEVICE_COLLECTIVES = 90,
|
DEVICE_COLLECTIVES = 60,
|
||||||
|
// Host is waiting for input.
|
||||||
|
HOST_WAIT_INPUT = 70,
|
||||||
|
// Device-to-device communication.
|
||||||
|
DEVICE_TO_DEVICE = 80,
|
||||||
|
// Device-to-host communication.
|
||||||
|
DEVICE_TO_HOST = 90,
|
||||||
// Device is computing with 32-bit precision.
|
// Device is computing with 32-bit precision.
|
||||||
DEVICE_COMPUTE_32 = 100,
|
DEVICE_COMPUTE_32 = 100,
|
||||||
// Device is computing with 16-bit precision.
|
// Device is computing with 16-bit precision.
|
||||||
|
Loading…
Reference in New Issue
Block a user