From 0451edab7d57b4a1cebde86518807193d9800525 Mon Sep 17 00:00:00 2001
From: "A. Unique TensorFlower" <gardener@tensorflow.org>
Date: Mon, 21 Sep 2020 10:46:25 -0700
Subject: [PATCH] [Profiler] Separate device collective communication time in
 the GPU step-time breakdown.

PiperOrigin-RevId: 332884401
Change-Id: Ia97fb9887d29f5faa2205ee97cfba67de87411b0
---
 .../op_stats_to_input_pipeline_analysis.cc    | 69 +++++++++++++++++--
 .../convert/op_stats_to_overview_page.cc      |  4 ++
 .../profiler/protobuf/input_pipeline.proto    | 10 +++
 .../profiler/protobuf/overview_page.proto     |  6 ++
 tensorflow/core/profiler/utils/event_span.h   | 17 +++--
 5 files changed, 95 insertions(+), 11 deletions(-)
diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
index 6828950e6a5..0aea94b23d4 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc
@@ -59,6 +59,7 @@ const double kNumPsPerMs = 1000000000.0;
 // input-bound; else if it is considered HIGHLY input-bound.
 constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
 constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
+
 // If the percentage of step time that is due to outfeed is less than
 // kModeratelyOutfeedBoundThresholdInPercent, it is considered NOT
 // output-bound; else if it is less than
@@ -66,6 +67,7 @@ constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
 // output-bound; else if it is considered HIGHLY output-bound.
 constexpr double kModeratelyOutfeedBoundThresholdInPercent = 5;
 constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
+
 // If the percentage of step time that is due to kernel launch is less than
 // kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
 // kernel-launch bound; else if it is less than
@@ -73,6 +75,7 @@ constexpr double kHighlyOutfeedBoundThresholdInPercent = 20;
 // kernel-launch bound; else if it is considered HIGHLY kernel-launch bound.
 constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
 constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
+
 // If the percentage of step time that is due to all other time is less than
 // kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
 // all-other bound; else if it is less than
@@ -80,6 +83,16 @@ constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
 // all-other bound; else if it is considered HIGHLY all-other bound.
 constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
 constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
+
+// If the percentage of step time that is due to device collectives is less than
+// kModeratelyDeviceCollectivesBoundThresholdInPercent, it is considered NOT
+// device-collectives bound; else if it is less than
+// kHighlyDeviceCollectivesBoundThresholdInPercent, it is considered MODERATELY
+// device-collectives bound; else it is considered HIGHLY device-collectives
+// bound.
+constexpr double kModeratelyDeviceCollectivesBoundThresholdInPercent = 3;
+constexpr double kHighlyDeviceCollectivesBoundThresholdInPercent = 15;
+
 // Section number of the host-analysis section in the input-pipeline analysis.
 constexpr int kHostAnalysisSectionNumber = 3;
 // Python-only explanation for "All Others" time.
@@ -125,6 +138,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
   Stat<double> output_ms;
   Stat<double> device_compute_ms;
   Stat<double> device_to_device_ms;
+  Stat<double> device_collectives_ms;
   Stat<double> host_compute_ms;
   Stat<double> host_prepare_ms;
   Stat<double> host_compile_ms;
@@ -146,6 +160,7 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
     output_ms.UpdateStat(details.output_ms());
     device_compute_ms.UpdateStat(details.device_compute_ms());
     device_to_device_ms.UpdateStat(details.device_to_device_ms());
+    device_collectives_ms.UpdateStat(details.device_collectives_ms());
     host_compute_ms.UpdateStat(details.host_compute_ms());
     host_prepare_ms.UpdateStat(details.host_prepare_ms());
     host_compile_ms.UpdateStat(details.host_compile_ms());
@@ -162,6 +177,8 @@ GenericStepTimeBreakdown ComputeGenericStepTimeBreakdownInMs(
       GetStepSummaryForSampleStats(device_compute_ms);
   *result.mutable_device_to_device_ms_summary() =
       GetStepSummaryForSampleStats(device_to_device_ms);
+  *result.mutable_device_collectives_ms_summary() =
+      GetStepSummaryForSampleStats(device_collectives_ms);
   *result.mutable_host_compute_ms_summary() =
       GetStepSummaryForSampleStats(host_compute_ms);
   *result.mutable_host_prepare_ms_summary() =
@@ -208,14 +225,13 @@ InputPipelineAnalysisResult ComputeGenericInputPipelineAnalysisResult(
                                   GetTimeInMs(type_ps, DEVICE_WAIT_HOST));
     details.set_output_ms(GetTimeInMs(type_ps, DEVICE_TO_HOST));
     details.set_device_compute_ms(GetTimeInMs(type_ps, DEVICE_COMPUTE_16) +
-                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32) +
-                                  GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
+                                  GetTimeInMs(type_ps, DEVICE_COMPUTE_32));
     details.set_device_to_device_ms(GetTimeInMs(type_ps, DEVICE_TO_DEVICE) +
                                     GetTimeInMs(type_ps, DEVICE_WAIT_DEVICE));
+    details.set_device_collectives_ms(GetTimeInMs(type_ps, DEVICE_COLLECTIVES));
     details.set_host_compute_ms(GetTimeInMs(type_ps, HOST_COMPUTE));
     details.set_host_prepare_ms(GetTimeInMs(type_ps, HOST_PREPARE));
     details.set_host_compile_ms(GetTimeInMs(type_ps, HOST_COMPILE));
-
     result.add_step_details()->PackFrom(details);
 
     const double input_percent_of_step_time =
@@ -360,6 +376,32 @@ double RatioOfHostToDeviceTimeToStepTime(
   return 0.0;
 }
 
+void DeviceCollectivesAnalysis(double device_collectives_percent,
+                               std::string* device_collectives_classification,
+                               std::string* device_collectives_statement) {
+  std::string percent_str =
+      absl::StrFormat("%.1lf", device_collectives_percent);
+
+  if (device_collectives_percent >=
+      kHighlyDeviceCollectivesBoundThresholdInPercent) {
+    *device_collectives_classification = "high";
+    *device_collectives_statement =
+        absl::StrCat(percent_str,
+                     " % of the total step time sampled is spent on 'Device "
+                     "Collective Communication'.");
+  } else if (device_collectives_percent >=
+             kModeratelyDeviceCollectivesBoundThresholdInPercent) {
+    *device_collectives_classification = "moderate";
+    *device_collectives_statement =
+        absl::StrCat(percent_str,
+                     " % of the total step time sampled is spent on 'Device "
+                     "Collective Communication'.");
+  } else {
+    *device_collectives_classification = "no";
+    *device_collectives_statement = "";
+  }
+}
+
 void KernelLaunchAnalysis(bool tfdata_used, double kernel_launch_percent,
                           std::string* kernel_launch_classification,
                           std::string* kernel_launch_statement) {
@@ -660,6 +702,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
   double total_host_compile_ms = 0;
   double total_device_compute_ms = 0;
   double total_device_to_device_ms = 0;
+  double total_device_collectives_ms = 0;
   double total_unknown_ms = 0;
 
   for (const google::protobuf::Any& step_details : any_step_details) {
@@ -677,6 +720,7 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
     total_host_prepare_ms += details.host_prepare_ms();
     total_device_compute_ms += details.device_compute_ms();
     total_device_to_device_ms += details.device_to_device_ms();
+    total_device_collectives_ms += details.device_collectives_ms();
     total_host_compute_ms += details.host_compute_ms();
     total_host_compile_ms += details.host_compile_ms();
     total_unknown_ms += details.unknown_time_ms();
@@ -692,24 +736,37 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
     analysis.set_kernel_launch_statement("");
     analysis.set_all_other_classification("no");
     analysis.set_all_other_statement("");
+    analysis.set_device_collectives_classification("no");
+    analysis.set_device_collectives_statement("");
     return analysis;
   }
   double input_percent = 100.0 * total_input_ms / total_step_time_ms;
   double output_percent = 100.0 * total_output_ms / total_step_time_ms;
   double compute_percent = 100.0 * total_device_compute_ms / total_step_time_ms;
+  double device_collectives_percent =
+      100.0 * total_device_collectives_ms / total_step_time_ms;
+
   // idle_percent includes host_prepare (i.e. kernel launch, device-to-device,
   // host compute, host compile, and unknown.
   double idle_percent =
-      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent);
+      std::max(0.0, 100.0 - input_percent - output_percent - compute_percent -
+                        device_collectives_percent);
   double kernel_launch_percent =
       100.0 * total_host_prepare_ms / total_step_time_ms;
   double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
+
   std::string input_classification;
   std::string input_statement;
   bool all_other_reported =
       InputAnalysis(input_percent, all_other_percent, &input_classification,
                     &input_statement);
 
+  std::string device_collectives_classification;
+  std::string device_collectives_statement;
+  DeviceCollectivesAnalysis(device_collectives_percent,
+                            &device_collectives_classification,
+                            &device_collectives_statement);
+
   std::string kernel_launch_classification;
   std::string kernel_launch_statement;
   KernelLaunchAnalysis(TfDataInUse(input_time_breakdown), kernel_launch_percent,
@@ -732,6 +789,10 @@ BottleneckAnalysis ComputeBottleneckAnalysis(
   analysis.set_kernel_launch_statement(kernel_launch_statement);
   analysis.set_all_other_classification(all_other_classification);
   analysis.set_all_other_statement(all_other_statement);
+  analysis.set_device_collectives_classification(
+      device_collectives_classification);
+  analysis.set_device_collectives_statement(device_collectives_statement);
+
   return analysis;
 }
 
diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
index 8f58b7bf3ae..8f0e920c7e6 100644
--- a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
+++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc
@@ -154,6 +154,10 @@ OverviewPageRecommendation ComputeGenericRecommendation(
     const PrecisionStats& precision_stats) {
   OverviewPageRecommendation re;
   GenericRecommendation generic;
+  generic.set_device_collectives_bottleneck(
+      bottleneck.device_collectives_classification());
+  generic.set_device_collectives_statement(
+      bottleneck.device_collectives_statement());
   generic.set_kernel_launch_bottleneck(
       bottleneck.kernel_launch_classification());
   generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement());
diff --git a/tensorflow/core/profiler/protobuf/input_pipeline.proto b/tensorflow/core/profiler/protobuf/input_pipeline.proto
index b20942d3d36..e64470558ad 100644
--- a/tensorflow/core/profiler/protobuf/input_pipeline.proto
+++ b/tensorflow/core/profiler/protobuf/input_pipeline.proto
@@ -30,6 +30,12 @@ message BottleneckAnalysis {
   string all_other_classification = 5;
   // A human-readable description of the all other overhead.
   string all_other_statement = 6;
+  // Indicates if device collective communication is a bottleneck. Possible
+  // values: "no", "moderate", "high".
+  string device_collectives_classification = 11;
+  // A human-readable description of the device collective communication
+  // overhead.
+  string device_collectives_statement = 12;
 }
 
 // Used for both step duration and Op duration.
@@ -60,6 +66,8 @@ message PerGenericStepDetails {
   double device_compute_ms = 6;
   // The device-to-device communication time (in ms).
   double device_to_device_ms = 7;
+  // The device time spent on collective communications (in ms).
+  double device_collectives_ms = 13;
   // The host-compute time (in ms).
   double host_compute_ms = 8;
   // The host-prepare time (in ms).
@@ -129,6 +137,8 @@ message GenericStepTimeBreakdown {
   StepSummary device_compute_ms_summary = 4;
   // Summary of all device-to-device time as a part of step in ms.
   StepSummary device_to_device_ms_summary = 5;
+  // Summary of all device-collectives time as a part of step in ms.
+  StepSummary device_collectives_ms_summary = 12;
   // Summary of all host-compute time as a part of step in ms.
   StepSummary host_compute_ms_summary = 6;
   // Summary of all host-prepare time as a part of step in ms.
diff --git a/tensorflow/core/profiler/protobuf/overview_page.proto b/tensorflow/core/profiler/protobuf/overview_page.proto
index 433f8df27a6..6eb8efe8afa 100644
--- a/tensorflow/core/profiler/protobuf/overview_page.proto
+++ b/tensorflow/core/profiler/protobuf/overview_page.proto
@@ -87,6 +87,12 @@ message GenericRecommendation {
   // A statement that recommends if the user should try using lower precision.
   // Shows this statement to users only if it is not empty.
   string precision_statement = 5;
+  // Indicates if device collectives are a performance bottleneck. Possible
+  // values: "no", "moderate", "high".
+  string device_collectives_bottleneck = 6;
+  // A statement that recommends if we need to further investigate
+  // device-collectives performance.
+  string device_collectives_statement = 7;
 }
 
 // Overview result for the recommendation section.
diff --git a/tensorflow/core/profiler/utils/event_span.h b/tensorflow/core/profiler/utils/event_span.h
index 6ffbd228d5e..e2c8f99a2e7 100644
--- a/tensorflow/core/profiler/utils/event_span.h
+++ b/tensorflow/core/profiler/utils/event_span.h
@@ -46,14 +46,17 @@ enum EventType {
   HOST_TO_DEVICE = 40,
   // Host is preparing to launch a computation on device.
   HOST_PREPARE = 50,
-  // Host is waiting for input.
-  HOST_WAIT_INPUT = 60,
-  // Device-to-device communication.
-  DEVICE_TO_DEVICE = 70,
-  // Device-to-host communication.
-  DEVICE_TO_HOST = 80,
+  // Assigns a smaller priority to DEVICE_COLLECTIVES than HOST_WAIT_INPUT,
+  // because if an all-reduce event overlaps with a host-wait-input event,
+  // we want to count it as waiting for input.
   // Collective Ops such as All-Reduce.
-  DEVICE_COLLECTIVES = 90,
+  DEVICE_COLLECTIVES = 60,
+  // Host is waiting for input.
+  HOST_WAIT_INPUT = 70,
+  // Device-to-device communication.
+  DEVICE_TO_DEVICE = 80,
+  // Device-to-host communication.
+  DEVICE_TO_HOST = 90,
   // Device is computing with 32-bit precision.
   DEVICE_COMPUTE_32 = 100,
   // Device is computing with 16-bit precision.