From 99eb2266550b09a647c477fe0c85a12984949616 Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Tue, 14 Jan 2020 14:46:15 -0800 Subject: [PATCH] Added RunEnvironment to OpStats. Added a converter from OpStats to OverviewPage. Added input and bottleneck analysis to InputPipelineAnalysis. PiperOrigin-RevId: 289735025 Change-Id: Ice4b2db5f241573afecce52aa882216ea16bd74c --- tensorflow/core/profiler/convert/BUILD | 23 +++ .../op_stats_to_input_pipeline_analysis.cc | 163 ++++++++++++++++++ .../op_stats_to_input_pipeline_analysis.h | 37 +++- .../convert/op_stats_to_overview_page.cc | 160 +++++++++++++++++ .../convert/op_stats_to_overview_page.h | 45 +++++ tensorflow/core/profiler/protobuf/BUILD | 5 +- .../core/profiler/protobuf/op_stats.proto | 8 +- .../profiler/protobuf/overview_page.proto | 57 +----- 8 files changed, 438 insertions(+), 60 deletions(-) create mode 100644 tensorflow/core/profiler/convert/op_stats_to_overview_page.cc create mode 100644 tensorflow/core/profiler/convert/op_stats_to_overview_page.h diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index f6f1d589c0d..c41fa2dbeda 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -66,6 +66,28 @@ cc_library( ], ) +cc_library( + name = "op_stats_to_overview_page", + srcs = ["op_stats_to_overview_page.cc"], + hdrs = ["op_stats_to_overview_page.h"], + deps = [ + ":op_metrics_to_record", + ":op_stats_to_input_pipeline_analysis", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core/platform:logging", + "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", + "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc", + "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", + "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", + "//tensorflow/core/profiler/utils:math_utils", + 
"//tensorflow/core/profiler/utils:op_metrics_db_utils", + "//tensorflow/core/profiler/utils:time_utils", + "@com_google_absl//absl/strings", + ], +) + cc_library( name = "op_stats_to_input_pipeline_analysis", srcs = ["op_stats_to_input_pipeline_analysis.cc"], @@ -88,6 +110,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc index 05c7ab5ebf9..be1a24b1412 100644 --- a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc +++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.cc @@ -23,6 +23,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" #include "absl/strings/string_view.h" #include "tensorflow/core/lib/gtl/map_util.h" #include "tensorflow/core/platform/logging.h" @@ -46,6 +47,28 @@ namespace { const double kNumPsPerMs = 1000000000.0; +// If the percentage of step time that is due to infeed is less than +// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT +// input-bound; else if it is less than +// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY +// input-bound; else if it is considered HIGHLY input-bound. 
+constexpr double kModeratelyInfeedBoundThresholdInPercent = 5; +constexpr double kHighlyInfeedBoundThresholdInPercent = 20; +// If the percentage of step time that is due to kernel launch is less than +// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT +// kernel-launch bound; else if it is less than +// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY +// kernel-launch bound; else it is considered HIGHLY kernel-launch bound. +constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3; +constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15; +// If the percentage of step time that is due to all other time is less than +// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT +// all-other bound; else if it is less than +// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY +// all-other bound; else it is considered HIGHLY all-other bound. +constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3; +constexpr double kHighlyAllOtherBoundThresholdInPercent = 15; + template <class Collection> double GetTimeInMs(const Collection& type_ps, EventType event_type) { return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0)); @@ -317,6 +340,47 @@ double RatioOfHostToDeviceTimeToStepTime( return 0.0; } +void KernelLaunchAnalysis(double kernel_launch_percent, int* observation_index, + string* kernel_launch_classification, + string* kernel_launch_statement) { + string percent_str = absl::StrFormat("%.1lf", kernel_launch_percent); + if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) { + *kernel_launch_classification = "high"; + *kernel_launch_statement = absl::StrCat( + "(", ++*observation_index, ") ", percent_str, + " % of the total step time sampled is spent on Kernel Launch."); + } else if (kernel_launch_percent >= + kModeratelyKernelLaunchBoundThresholdInPercent) { + *kernel_launch_classification = "moderate"; + *kernel_launch_statement = absl::StrCat( 
+ "(", ++*observation_index, ") ", percent_str, + " % of the total step time sampled is spent on Kernel Launch."); + } else { + *kernel_launch_classification = "no"; + *kernel_launch_statement = ""; + } +} + +void AllOtherAnalysis(double all_other_percent, int* observation_index, + string* all_other_classification, + string* all_other_statement) { + string percent_str = absl::StrFormat("%.1lf", all_other_percent); + if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) { + *all_other_classification = "high"; + *all_other_statement = absl::StrCat( + "(", ++*observation_index, ") ", percent_str, + " % of the total step time sampled is spent on All Others time."); + } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) { + *all_other_classification = "moderate"; + *all_other_statement = absl::StrCat( + "(", ++*observation_index, ") ", percent_str, + " % of the total step time sampled is spent on All Others time."); + } else { + *all_other_classification = "no"; + *all_other_statement = ""; + } +} + } // namespace void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db, @@ -451,5 +515,104 @@ InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( return result; } +void InfeedAnalysis(HardwareType hardware_type, double infeed_percent, + int* observation_index, string* input_classification, + string* input_statement) { + absl::string_view non_input_time = "other time"; + string infeed_percent_str = absl::StrFormat("%.1lf", infeed_percent); + if (infeed_percent >= kHighlyInfeedBoundThresholdInPercent) { + *input_classification = "host"; + *input_statement = absl::StrCat( + "(", ++*observation_index, ") ", + "Your program is HIGHLY input-bound because ", infeed_percent_str, + "% of the total step time sampled is waiting for input. 
Therefore, " + "you should first focus on reducing the input time."); + } else if (infeed_percent >= kModeratelyInfeedBoundThresholdInPercent) { + *input_classification = "both"; + *input_statement = absl::StrCat( + "(", ++*observation_index, ") ", + "Your program is MODERATELY input-bound because ", infeed_percent_str, + "% of the total step time sampled is waiting for input. Therefore, " + "you would need to reduce both the input time and ", + non_input_time, "."); + } else { + *input_classification = "device"; + *input_statement = absl::StrCat( + "(", ++*observation_index, ") ", + "Your program is NOT input-bound because only ", infeed_percent_str, + "% of the total step time sampled is waiting for " + "input. Therefore, you should focus on " + "reducing ", + non_input_time, "."); + } +} + +GenericBottleneck GenericOverallBottleneck( + const InputPipelineAnalysisResult& result) { + double total_step_time_ms = 0; + double total_input_ms = 0; + double total_output_ms = 0; + double total_host_compute_ms = 0; + double total_host_prepare_ms = 0; + double total_host_compile_ms = 0; + double total_device_to_device_ms = 0; + double total_unknown_ms = 0; + for (const google::protobuf::Any& step_details : result.step_details()) { + PerGenericStepDetails details; + bool success = step_details.UnpackTo(&details); + if (!success && !step_details.type_url().empty()) { + LOG(ERROR) << "Unable to unpack step_breakdown. 
Expected: generic" + << std::endl; + return {}; + } + total_step_time_ms += details.step_time_ms(); + total_input_ms += + details.host_wait_input_ms() + details.host_to_device_ms(); + total_output_ms += details.output_ms(); + total_host_prepare_ms += details.host_prepare_ms(); + total_device_to_device_ms += details.device_to_device_ms(); + total_host_compute_ms += details.host_compute_ms(); + total_host_compile_ms += details.host_compile_ms(); + total_unknown_ms += details.unknown_time_ms(); + } + if (total_step_time_ms == 0) { + return {{"unknown", + "No step time measured. Therefore we cannot tell where the " + "performance bottleneck is."}, + "no", + "", + "no", + ""}; + } + double input_percent = 100.0 * total_input_ms / total_step_time_ms; + double kernel_launch_percent = + 100.0 * total_host_prepare_ms / total_step_time_ms; + double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms; + int observation_index = 0; + string input_classification; + string input_statement; + InfeedAnalysis(result.hardware_type(), input_percent, &observation_index, + &input_classification, &input_statement); + + string kernel_launch_classification; + string kernel_launch_statement; + KernelLaunchAnalysis(kernel_launch_percent, &observation_index, + &kernel_launch_classification, &kernel_launch_statement); + + string all_other_classification; + string all_other_statement; + AllOtherAnalysis(all_other_percent, &observation_index, + &all_other_classification, &all_other_statement); + + return {{ + input_classification, + input_statement, + }, + kernel_launch_classification, + kernel_launch_statement, + all_other_classification, + all_other_statement}; +} + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h index aaf47b9595d..e3f40daf106 100644 --- 
a/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h +++ b/tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h @@ -25,8 +25,30 @@ limitations under the License. namespace tensorflow { namespace profiler { -InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( - const OpStats& op_stats, const HardwareType& hardware_type); +// Common performance bottleneck. +struct CommonBottleneck { + // Indicates if input is a bottleneck. Possible values: "host", "device", + // "both", or "unknown" + string input_classification; + // A human-readable description of the input bottleneck. + string input_statement; +}; + +// Generic hardware bottleneck. +struct GenericBottleneck { + // Bottleneck that exists on all hardware. + CommonBottleneck common; + // Indicates if kernel launching is a bottleneck. Possible values: "no", + // "moderate", "high". + string kernel_launch_classification; + // A human-readable description of the kernel launching overhead. + string kernel_launch_statement; + // Indicates if all other is a bottleneck. Possible values: "no", "moderate", + // "high". + string all_other_classification; + // A human-readable description of the all other overhead. + string all_other_statement; +}; // Computes the summary of step time in milliseconds. StepSummary ComputeStepTimeSummaryInMs( @@ -38,6 +60,17 @@ void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db, InputPipelineAnalysisRecommendation GenerateRecommendation(); +// Returns the performance bottleneck of the program executed. 
+GenericBottleneck GenericOverallBottleneck( + const InputPipelineAnalysisResult& result); + +InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis( + const OpStats& op_stats, const HardwareType& hardware_type); + +void InfeedAnalysis(HardwareType hardware_type, double infeed_percent, + int* observation_index, string* input_classification, + string* input_statement); + } // namespace profiler } // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc new file mode 100644 index 00000000000..367d7593f7c --- /dev/null +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.cc @@ -0,0 +1,160 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h" + +#include <algorithm> +#include <utility> + +#include "google/protobuf/any.pb.h" +#include "tensorflow/core/platform/logging.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/op_metrics_to_record.h" +#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/overview_page.pb.h" +#include "tensorflow/core/profiler/utils/math_utils.h" +#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h" +#include "tensorflow/core/profiler/utils/time_utils.h" + +namespace tensorflow { +namespace profiler { + +namespace { + +OverviewPageTip MakeOverviewPageTip(const string& text) { + OverviewPageTip tip; + tip.set_link(text); + return tip; +} + +string AnchorElement(const string& url, const string& text) { + return absl::StrCat("<a href=\"", url, "\" target=\"_blank\">", text, "</a>"); +} + +// Makes a recommendation for looking up a document. +// doc_url is expected to already be escaped suitably for use in an HTML +// attribute. 
+OverviewPageTip MakeOverviewPageTipDocLink(const string& doc_url, + const string& text) { + OverviewPageTip tip; + tip.set_link(AnchorElement(doc_url, text)); + return tip; +} + +void ComputeHostTips(OverviewPageRecommendation* re) { + *re->add_host_tips() = MakeOverviewPageTip( + "input_pipeline_analyzer (especially Section 3 for the breakdown of " + "input operations on the Host)"); + *re->add_host_tips() = MakeOverviewPageTip( + "trace_viewer (look at the activities on the timeline of each Host " + "Thread near the bottom of the trace view)"); +} + +void ComputeDeviceTips(HardwareType hardware_type, + OverviewPageRecommendation* re) { + const string& device_name = HardwareType_Name(hardware_type); + string timeline_name = + (hardware_type == tensorflow::profiler::TPU) ? "TPU core" : device_name; + *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat( + "op_profile (identify the time-consuming operations executed on the ", + device_name, ")")); + *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat( + "trace_viewer (look at the activities on the timeline of each ", + timeline_name, " in the trace view)")); +} + +void ComputeFaqTips(OverviewPageRecommendation* re) { + *re->add_faq_tips() = MakeOverviewPageTip("Refer to the Cloud tools FAQ"); +} + +void ComputeDocumentationTips(OverviewPageRecommendation* re) { + *re->add_documentation_tips() = MakeOverviewPageTipDocLink( + "https://www.tensorflow.org/versions/master/api_docs/python/tf/data/" + "Dataset", + "TensorFlow Input Pipeline API"); +} + +} // namespace + +void SetCommonRecommendation(const CommonBottleneck& bottleneck, + HardwareType hardware_type, + OverviewPageRecommendation* re) { + re->set_bottleneck(bottleneck.input_classification); + re->set_statement(bottleneck.input_statement); + ComputeHostTips(re); + ComputeDeviceTips(hardware_type, re); + ComputeDocumentationTips(re); + ComputeFaqTips(re); +} + +OverviewPageRecommendation ComputeGenericRecommendation( + const GenericBottleneck& 
bottleneck) { + OverviewPageRecommendation re; + GenericRecommendation generic; + generic.set_kernel_launch_bottleneck(bottleneck.kernel_launch_classification); + generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement); + generic.set_all_other_bottleneck(bottleneck.all_other_classification); + generic.set_all_other_statement(bottleneck.all_other_statement); + re.mutable_recommendation()->PackFrom(generic); + return re; +} + +OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) { + OverviewPageAnalysis analysis; + OpMetricsDb metrics_db = + CreateTfMetricsDbFromHloMetricsDb(op_stats.device_op_metrics_db()); + uint64 total_device_time_ps = metrics_db.total_time_ps(); + constexpr int kNumTopOpsShown = 10; + double device_cumulative_fraction = 0.0; + for (const OpMetrics* metrics : + SortedOpMetricsDb(metrics_db, kNumTopOpsShown)) { + OverviewTfOp* op = analysis.add_top_device_ops(); + op->set_name(metrics->name()); + op->set_category(metrics->category()); + op->set_self_time_fraction( + SafeDivide(metrics->self_time_ps(), total_device_time_ps)); + device_cumulative_fraction += op->self_time_fraction(); + op->set_cumulative_time_fraction(device_cumulative_fraction); + op->set_flop_rate( + SafeDivide(metrics->flops(), PicosToNanos(metrics->time_ps()))); + } + return analysis; +} + +OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, + HardwareType hardware_type) { + OverviewPageAnalysis analysis = ComputeAnalysisResult(op_stats); + InputPipelineAnalysisResult input_analysis = + ConvertOpStatsToInputPipelineAnalysis(op_stats, hardware_type); + GenericBottleneck bottleneck = GenericOverallBottleneck(input_analysis); + OverviewPageRecommendation recommendation = + ComputeGenericRecommendation(bottleneck); + SetCommonRecommendation(bottleneck.common, hardware_type, &recommendation); + + OverviewPage overview_page; + *overview_page.mutable_run_environment() = op_stats.run_environment(); + *overview_page.mutable_analysis() = 
analysis; + *overview_page.mutable_input_analysis() = input_analysis; + *overview_page.mutable_recommendation() = recommendation; + return overview_page; +} + +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/op_stats_to_overview_page.h b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h new file mode 100644 index 00000000000..875f08aa956 --- /dev/null +++ b/tensorflow/core/profiler/convert/op_stats_to_overview_page.h @@ -0,0 +1,45 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ +#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ + +#include "absl/strings/string_view.h" +#include "tensorflow/core/platform/protobuf.h" +#include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/protobuf/op_stats.pb.h" +#include "tensorflow/core/profiler/protobuf/overview_page.pb.h" + +namespace tensorflow { +namespace profiler { + +void SetCommonRecommendation(const CommonBottleneck& bottleneck, + HardwareType hardware_type, + OverviewPageRecommendation* re); + +OverviewPageRecommendation ComputeGenericRecommendation( + const GenericBottleneck& bottleneck); + +OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats); + +OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats, + HardwareType hardware_type); + +} // namespace profiler +} // namespace tensorflow + +#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_ diff --git a/tensorflow/core/profiler/protobuf/BUILD b/tensorflow/core/profiler/protobuf/BUILD index ecf6d2b26ae..cdbf0e605da 100644 --- a/tensorflow/core/profiler/protobuf/BUILD +++ b/tensorflow/core/profiler/protobuf/BUILD @@ -40,7 +40,10 @@ tf_proto_library( name = "overview_page_proto", srcs = ["overview_page.proto"], cc_api_version = 2, - protodeps = [":input_pipeline_proto"], + protodeps = [ + ":input_pipeline_proto", + ":op_stats_proto", + ], visibility = [ ":friends", ], diff --git a/tensorflow/core/profiler/protobuf/op_stats.proto b/tensorflow/core/profiler/protobuf/op_stats.proto index a48b66204be..a3926bea7b5 100644 --- a/tensorflow/core/profiler/protobuf/op_stats.proto +++ b/tensorflow/core/profiler/protobuf/op_stats.proto @@ -54,7 +54,7 @@ message 
SystemTopology { int64 num_expected_reduced_chips = 4; } -// Result proto for RunEnvironment (the run environment of a profiling session). +// The run environment of a profiling session. message RunEnvironment { // Number of hosts used. int32 host_count = 1; @@ -71,9 +71,9 @@ message RunEnvironment { int32 device_core_count = 5; // The per-device-core batch size. int32 per_core_batch_size = 6; - // Host-independent job information. + // Host-independent information about this job. HostIndependentJobInfoResult host_independent_job_info = 7; - // Host-dependent job information. + // Host-dependent information about this job. repeated HostDependentJobInfoResult host_dependent_job_info = 8; // The number of replicas, corresponds to input parallelism. // If there is no model parallelism, replica_count = device_core_count @@ -97,4 +97,6 @@ message OpStats { PerfEnv perf_env = 3; // The database of step sequences. StepDatabaseResult step_db = 4; + // The run environment of this profiling session. + RunEnvironment run_environment = 5; } diff --git a/tensorflow/core/profiler/protobuf/overview_page.proto b/tensorflow/core/profiler/protobuf/overview_page.proto index c7fc6c8936b..18512cac879 100644 --- a/tensorflow/core/profiler/protobuf/overview_page.proto +++ b/tensorflow/core/profiler/protobuf/overview_page.proto @@ -4,59 +4,7 @@ package tensorflow.profiler; import "google/protobuf/any.proto"; import "tensorflow/core/profiler/protobuf/input_pipeline.proto"; - -// Overview result for host-independent job information. -message OverviewPageHostIndependentJobInfo { - // The CL of the build. - int64 change_list = 1; - // The time of this build (nanoseconds since the Unix epoch). - int64 build_time = 2; - // The target of this build. - string build_target = 3; - // Profiling duration (in ms). - uint32 profile_duration_ms = 4; -} - -// Overview result for host-dependent job information. -message OverviewPageHostDependentJobInfo { - // The ID of the host where this job was run. 
- string host_id = 1; - // The command line for this run. - string command_line = 2; - // The start time of this run (nanoseconds since the Unix epoch). - int64 start_time = 3; - // BNS address specified by client at time of profiling request. - string bns_address = 4; - // Profiling start walltime (in ns). - uint64 profile_time_ns = 5; -} - -// Overview result for run environment. -message OverviewPageRunEnvironment { - // Number of hosts used. - int32 host_count = 1; - // Number of tasks used. - int32 task_count = 2; - // The type of device used. - string device_type = 3; - // The number of device cores used. - // What "device core" means depends on the platform: - // For TPU, a device core is a TPU core. - // For Nvidia GPU, a device core is a GPU (not a SM). - int32 device_core_count = 4; - // The per-device-core batch size. - int32 per_core_batch_size = 5; - // Host-independent information about this job. - OverviewPageHostIndependentJobInfo host_independent_job_info = 6; - // Host-dependent information about this job. - repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 7; - // The number of replicas, corresponds to input parallelism. - // If there is no model parallelism, replica_count = device_core_count - int32 replica_count = 8; - // The number of cores used for a single replica, e.g. model parallelism. - // If there is no model parallelism, then num_cores_per_replica = 1 - int32 num_cores_per_replica = 9; -} +import "tensorflow/core/profiler/protobuf/op_stats.proto"; // Overview result for a TensorFlow Op. message OverviewTfOp { @@ -138,11 +86,12 @@ message OverviewPageRecommendation { message OverviewPage { // The run environment of the profiled session. - OverviewPageRunEnvironment run_environment = 1; + RunEnvironment run_environment = 5; // The step-time result. InputPipelineAnalysisResult input_analysis = 2; // The other analysis result. OverviewPageAnalysis analysis = 3; // The recommendation made to the user. 
OverviewPageRecommendation recommendation = 4; + reserved 1; }