[Profiler] Separate device collective communication time in the GPU step-time breakdown.

PiperOrigin-RevId: 332884401 Change-Id: Ia97fb9887d29f5faa2205ee97cfba67de87411b0
2020-09-21 10:46:25 -07:00 · 2020-09-21 10:46:25 -07:00 · 0451edab7d
commit 0451edab7d
parent 66fe41900b
5 changed files with 95 additions and 11 deletions
--- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
@ -59,6 +59,7 @@ const double kNumPsPerMs = 1000000000.0;
 // input-bound; else if it is considered HIGHLY input-bound.
 constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
 constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
+
 // If the percentage of step time that is due to outfeed is less than
 // kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
 // output-bound; else if it is less than
@ -66,6 +67,7 @@ constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
 // output-bound; else if it is considered HIGHLY output-bound.
 constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
 constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
+
 // If the percentage of step time that is due to kernel launch is less than
 // kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
 // kernel-launch bound; else if it is less than
@ -73,6 +75,7 @@ constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
 // kernel-launch bound; else if it is considered HIGHLY kernel-launch bound.
 constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
 constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
+
 // If the percentage of step time that is due to all other time is less than
 // kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
 // all-other bound; else if it is less than
@ -80,6 +83,16 @@ constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
 // all-other bound; else if it is considered HIGHLY all-other bound.
 constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
 constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
+
+// If the percentage of step time that is due to device collectives is less than
+// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
+// device-collectives bound; else if it is less than
+// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
+// device-collectives bound; otherwise it is considered HIGHLY
+// device-collectives bound.
+constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
+constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
+
 // Section number of the host-analysis section in the input-pipeline analysis.
 constexpr int kHostAnalysisSectionNumber = 3;
 // Python-only explanation for "All Others" time.
@ -125,6 +138,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
  Stat<double> output_ms;
  Stat<double> device_compute_ms;
  Stat<double> device_to_device_ms;
+  Stat<double> device_collectives_ms;
  Stat<double> host_compute_ms;
  Stat<double> host_prepare_ms;
  Stat<double> host_compile_ms;
@ -146,6 +160,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
    output_ms.UpdateStat(details.output_ms());
    device_compute_ms.UpdateStat(details.device_compute_ms());
    device_to_device_ms.UpdateStat(details.device_to_device_ms());
+    device_collectives_ms.UpdateStat(details.device_collectives_ms());
    host_compute_ms.UpdateStat(details.host_compute_ms());
    host_prepare_ms.UpdateStat(details.host_prepare_ms());
    host_compile_ms.UpdateStat(details.host_compile_ms());
@ -162,6 +177,8 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
      GetStepSummaryForSampleStats(device_compute_ms);
  *result.mutable_device_to_device_ms_summary() =
      GetStepSummaryForSampleStats(device_to_device_ms);
+  *result.mutable_device_collectives_ms_summary() =
+      GetStepSummaryForSampleStats(device_collectives_ms);
  *result.mutable_host_compute_ms_summary() =
      GetStepSummaryForSampleStats(host_compute_ms);
  *result.mutable_host_prepare_ms_summary() =
@ -208,14 +225,13 @@ InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
                                  GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
    details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
    details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
-                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32) +
-                                  GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
+                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
    details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                    GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
+    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
    details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
    details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
    details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
-
    result.add_step_details()->PackFrom(details);

    const double input_percent_of_step_time =
@ -360,6 +376,32 @@ double RatioOfHostToDeviceTimeToStepTime(
  return 0.0;
 }

+void DeviceCollectivesAnalysis(double device_collectives_percent,
+                               std::string* device_collectives_classification,
+                               std::string* device_collectives_statement) {
+  std::string percent_str =
+      absl::StrFormat("%.1lf", device_collectives_percent);  // One decimal.
+
+  if (device_collectives_percent >=
+      kHighlyDeviceCollectivesBoundThresholdInPercent) {
+    *device_collectives_classification = "high";
+    *device_collectives_statement =
+        absl::StrCat(percent_str,
+                     " % of the total step time sampled is spent on 'Device "
+                     "Collective Communication'.");
+  } else if (device_collectives_percent >=
+             kModeratelyDeviceCollectivesBoundThresholdInPercent) {
+    *device_collectives_classification = "moderate";
+    *device_collectives_statement =  // Same text as the high case.
+        absl::StrCat(percent_str,
+                     " % of the total step time sampled is spent on 'Device "
+                     "Collective Communication'.");
+  } else {  // Below both thresholds: classified "no", empty statement.
+    *device_collectives_classification = "no";
+    *device_collectives_statement = "";
+  }
+}
+
 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
                          std::string* kernel_launch_classification,
                          std::string* kernel_launch_statement) {
@ -660,6 +702,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
  double total_host_compile_ms = 0;
  double total_device_compute_ms = 0;
  double total_device_to_device_ms = 0;
+  double total_device_collectives_ms = 0;
  double total_unknown_ms = 0;

  for (const google::protobuf::Any& step_details : any_step_details) {
@ -677,6 +720,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
    total_host_prepare_ms += details.host_prepare_ms();
    total_device_compute_ms += details.device_compute_ms();
    total_device_to_device_ms += details.device_to_device_ms();
+    total_device_collectives_ms += details.device_collectives_ms();
    total_host_compute_ms += details.host_compute_ms();
    total_host_compile_ms += details.host_compile_ms();
    total_unknown_ms += details.unknown_time_ms();
@ -692,24 +736,37 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
    analysis.set_kernel_launch_statement("");
    analysis.set_all_other_classification("no");
    analysis.set_all_other_statement("");
+    analysis.set_device_collectives_classification("no");
+    analysis.set_device_collectives_statement("");
    return analysis;
  }
  double input_percent = 100.0 * total_input_ms / total_step_time_ms;
  double output_percent = 100.0 * total_output_ms / total_step_time_ms;
  double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
+  double device_collectives_percent =
+      100.0 * total_device_collectives_ms / total_step_time_ms;
+
  // idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
  // host compute, host compile, and unknown.
  double idle_percent =
-      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent);
+      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
+                        device_collectives_percent);
  double kernel_launch_percent =
      100.0 * total_host_prepare_ms / total_step_time_ms;
  double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
+
  std::string input_classification;
  std::string input_statement;
  bool all_other_reported =
      InputAnalysis(input_percent, all_other_percent, &input_classification,
                    &input_statement);

+  std::string device_collectives_classification;
+  std::string device_collectives_statement;
+  DeviceCollectivesAnalysis(device_collectives_percent,
+                            &device_collectives_classification,
+                            &device_collectives_statement);
+
  std::string kernel_launch_classification;
  std::string kernel_launch_statement;
  KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
@ -732,6 +789,10 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
  analysis.set_kernel_launch_statement(kernel_launch_statement);
  analysis.set_all_other_classification(all_other_classification);
  analysis.set_all_other_statement(all_other_statement);
+  analysis.set_device_collectives_classification(
+      device_collectives_classification);
+  analysis.set_device_collectives_statement(device_collectives_statement);
+
  return analysis;
 }

--- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
@ -154,6 +154,10 @@ OverviewPageRecommendation ComputeGenericRecommendation(
    const PrecisionStats& precision_stats) {
  OverviewPageRecommendation re;
  GenericRecommendation generic;
+  generic.set_device_collectives_bottleneck(
+      bottleneck.device_collectives_classification());
+  generic.set_device_collectives_statement(
+      bottleneck.device_collectives_statement());
  generic.set_kernel_launch_bottleneck(
      bottleneck.kernel_launch_classification());
  generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
--- a/tensorflow/core/profiler/protobuf/input_pipeline.proto
+++ b/tensorflow/core/profiler/protobuf/input_pipeline.proto
@ -30,6 +30,12 @@ message BottleneckAnalysis {
  string all_other_classification = 5;
  // A human-readable description of the all other overhead.
  string all_other_statement = 6;
+  // Indicates if device collective communication is a bottleneck. Possible
+  // values: "no", "moderate", "high".
+  string device_collectives_classification = 11;
+  // A human-readable description of the device collective communication
+  // overhead.
+  string device_collectives_statement = 12;
 }

 // Used for both step duration and Op duration.
@ -60,6 +66,8 @@ message PerGenericStepDetails {
  double device_compute_ms = 6;
  // The device-to-device communication time (in ms).
  double device_to_device_ms = 7;
+  // The device time spent on collective communications (in ms).
+  double device_collectives_ms = 13;
  // The host-compute time (in ms).
  double host_compute_ms = 8;
  // The host-prepare time (in ms).
@ -129,6 +137,8 @@ message GenericStepTimeBreakdown {
  StepSummary device_compute_ms_summary = 4;
  // Summary of all device-to-device time as a part of step in ms.
  StepSummary device_to_device_ms_summary = 5;
+  // Summary of all device-collectives time as a part of step in ms.
+  StepSummary device_collectives_ms_summary = 12;
  // Summary of all host-compute time as a part of step in ms.
  StepSummary host_compute_ms_summary = 6;
  // Summary of all host-prepare time as a part of step in ms.
--- a/tensorflow/core/profiler/protobuf/overview_page.proto
+++ b/tensorflow/core/profiler/protobuf/overview_page.proto
@ -87,6 +87,12 @@ message GenericRecommendation {
  // A statement that recommends if the user should try using lower precision.
  // Shows this statement to users only if it is not empty.
  string precision_statement = 5;
+  // Indicates if device collectives are a performance bottleneck. Possible
+  // values: "no", "moderate", "high".
+  string device_collectives_bottleneck = 6;
+  // A statement that recommends if we need to further investigate
+  // device-collectives performance.
+  string device_collectives_statement = 7;
 }

 // Overview result for the recommendation section.
--- a/tensorflow/core/profiler/utils/event_span.h
+++ b/tensorflow/core/profiler/utils/event_span.h
@ -46,14 +46,17 @@ enum EventType {
  HOST_TO_DEVICE = 40,
  // Host is preparing to launch a computation on device.
  HOST_PREPARE = 50,
-  // Host is waiting for input.
-  HOST_WAIT_INPUT = 60,
-  // Device-to-device communication.
-  DEVICE_TO_DEVICE = 70,
-  // Device-to-host communication.
-  DEVICE_TO_HOST = 80,
+  // Assigns a smaller priority to DEVICE_COLLECTIVES than HOST_WAIT_INPUT,
+  // because if an all-reduce event overlaps with a host-wait-input event,
+  // we want to count it as waiting for input.
  // Collective Ops such as All-Reduce.
-  DEVICE_COLLECTIVES = 90,
+  DEVICE_COLLECTIVES = 60,
+  // Host is waiting for input.
+  HOST_WAIT_INPUT = 70,
+  // Device-to-device communication.
+  DEVICE_TO_DEVICE = 80,
+  // Device-to-host communication.
+  DEVICE_TO_HOST = 90,
  // Device is computing with 32-bit precision.
  DEVICE_COMPUTE_32 = 100,
  // Device is computing with 16-bit precision.