Added RunEnvironment to OpStats.

Added a converter from OpStats to OverviewPage.
Added input and bottleneck analysis to InputPipelineAnalysis.

PiperOrigin-RevId: 289735025
Change-Id: Ice4b2db5f241573afecce52aa882216ea16bd74c
This commit is contained in:
A. Unique TensorFlower 2020-01-14 14:46:15 -08:00 committed by TensorFlower Gardener
parent 230ebd5d96
commit 99eb226655
8 changed files with 438 additions and 60 deletions

View File

@ -66,6 +66,28 @@ cc_library(
],
)
# Converter from an OpStats proto to an OverviewPage proto
# (top device ops, input-pipeline analysis, and recommendations).
cc_library(
    name = "op_stats_to_overview_page",
    srcs = ["op_stats_to_overview_page.cc"],
    hdrs = ["op_stats_to_overview_page.h"],
    deps = [
        ":op_metrics_to_record",
        ":op_stats_to_input_pipeline_analysis",
        "//tensorflow/core:lib",
        "//tensorflow/core:lib_internal",
        "//tensorflow/core/platform:logging",
        "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc",
        "//tensorflow/core/profiler/protobuf:input_pipeline_proto_cc",
        "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc",
        "//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
        "//tensorflow/core/profiler/protobuf:overview_page_proto_cc",
        "//tensorflow/core/profiler/utils:math_utils",
        "//tensorflow/core/profiler/utils:op_metrics_db_utils",
        "//tensorflow/core/profiler/utils:time_utils",
        "@com_google_absl//absl/strings",
    ],
)
cc_library(
name = "op_stats_to_input_pipeline_analysis",
srcs = ["op_stats_to_input_pipeline_analysis.cc"],
@ -88,6 +110,7 @@ cc_library(
"@com_google_absl//absl/algorithm:container",
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
],
)

View File

@ -23,6 +23,7 @@ limitations under the License.
#include "absl/container/flat_hash_map.h"
#include "absl/strings/match.h"
#include "absl/strings/str_cat.h"
#include "absl/strings/str_format.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/logging.h"
@ -46,6 +47,28 @@ namespace {
const double kNumPsPerMs = 1000000000.0;
// If the percentage of step time that is due to infeed is less than
// kModeratelyInfeedBoundThresholdInPercent, it is considered NOT
// input-bound; else if it is less than
// kHighlyInfeedBoundThresholdInPercent, it is considered MODERATELY
// input-bound; otherwise it is considered HIGHLY input-bound.
constexpr double kModeratelyInfeedBoundThresholdInPercent = 5;
constexpr double kHighlyInfeedBoundThresholdInPercent = 20;
// If the percentage of step time that is due to kernel launch is less than
// kModeratelyKernelLaunchBoundThresholdInPercent, it is considered NOT
// kernel-launch bound; else if it is less than
// kHighlyKernelLaunchBoundThresholdInPercent, it is considered MODERATELY
// kernel-launch bound; otherwise it is considered HIGHLY kernel-launch bound.
constexpr double kModeratelyKernelLaunchBoundThresholdInPercent = 3;
constexpr double kHighlyKernelLaunchBoundThresholdInPercent = 15;
// If the percentage of step time that is due to all other time is less than
// kModeratelyAllOtherBoundThresholdInPercent, it is considered NOT
// all-other bound; else if it is less than
// kHighlyAllOtherBoundThresholdInPercent, it is considered MODERATELY
// all-other bound; otherwise it is considered HIGHLY all-other bound.
constexpr double kModeratelyAllOtherBoundThresholdInPercent = 3;
constexpr double kHighlyAllOtherBoundThresholdInPercent = 15;
template <class Collection>
double GetTimeInMs(const Collection& type_ps, EventType event_type) {
return PicosToMillis(gtl::FindWithDefault(type_ps, event_type, /*value=*/0));
@ -317,6 +340,47 @@ double RatioOfHostToDeviceTimeToStepTime(
return 0.0;
}
// Classifies how kernel-launch bound the program is ("no", "moderate", or
// "high") from the percentage of step time spent on kernel launch, and
// produces a numbered observation statement when the overhead is significant.
// Fix: the original duplicated the identical statement-building code in both
// the "high" and "moderate" branches; it is now built once.
void KernelLaunchAnalysis(double kernel_launch_percent, int* observation_index,
                          string* kernel_launch_classification,
                          string* kernel_launch_statement) {
  if (kernel_launch_percent >= kHighlyKernelLaunchBoundThresholdInPercent) {
    *kernel_launch_classification = "high";
  } else if (kernel_launch_percent >=
             kModeratelyKernelLaunchBoundThresholdInPercent) {
    *kernel_launch_classification = "moderate";
  } else {
    // Below the moderate threshold: no observation is emitted and the
    // observation counter is not consumed.
    *kernel_launch_classification = "no";
    *kernel_launch_statement = "";
    return;
  }
  // The statement text is identical for "moderate" and "high".
  *kernel_launch_statement = absl::StrCat(
      "(", ++*observation_index, ") ",
      absl::StrFormat("%.1lf", kernel_launch_percent),
      " % of the total step time sampled is spent on Kernel Launch.");
}
// Classifies how bound the program is by unaccounted ("all other") time
// ("no", "moderate", or "high") and produces a numbered observation statement
// when the overhead is significant.
// Fix: the original duplicated the identical statement-building code in both
// the "high" and "moderate" branches; it is now built once.
void AllOtherAnalysis(double all_other_percent, int* observation_index,
                      string* all_other_classification,
                      string* all_other_statement) {
  if (all_other_percent >= kHighlyAllOtherBoundThresholdInPercent) {
    *all_other_classification = "high";
  } else if (all_other_percent >= kModeratelyAllOtherBoundThresholdInPercent) {
    *all_other_classification = "moderate";
  } else {
    // Below the moderate threshold: no observation is emitted and the
    // observation counter is not consumed.
    *all_other_classification = "no";
    *all_other_statement = "";
    return;
  }
  // The statement text is identical for "moderate" and "high".
  *all_other_statement = absl::StrCat(
      "(", ++*observation_index, ") ",
      absl::StrFormat("%.1lf", all_other_percent),
      " % of the total step time sampled is spent on All Others time.");
}
} // namespace
void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
@ -451,5 +515,104 @@ InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
return result;
}
// Classifies the input-boundness of the program ("host", "both", or "device")
// from the percentage of step time spent waiting for input, and writes a
// numbered human-readable statement explaining the verdict.
// NOTE(review): hardware_type is currently unused; the non-input-time wording
// is the same for all hardware -- confirm whether this is intended.
void InfeedAnalysis(HardwareType hardware_type, double infeed_percent,
                    int* observation_index, string* input_classification,
                    string* input_statement) {
  const absl::string_view non_input_time = "other time";
  const string pct = absl::StrFormat("%.1lf", infeed_percent);
  // Every branch emits exactly one observation, so the counter can be
  // consumed up front.
  const string numbering = absl::StrCat("(", ++*observation_index, ") ");
  if (infeed_percent >= kHighlyInfeedBoundThresholdInPercent) {
    *input_classification = "host";
    *input_statement = absl::StrCat(
        numbering, "Your program is HIGHLY input-bound because ", pct,
        "% of the total step time sampled is waiting for input. Therefore, "
        "you should first focus on reducing the input time.");
  } else if (infeed_percent >= kModeratelyInfeedBoundThresholdInPercent) {
    *input_classification = "both";
    *input_statement = absl::StrCat(
        numbering, "Your program is MODERATELY input-bound because ", pct,
        "% of the total step time sampled is waiting for input. Therefore, "
        "you would need to reduce both the input time and ",
        non_input_time, ".");
  } else {
    *input_classification = "device";
    *input_statement = absl::StrCat(
        numbering, "Your program is NOT input-bound because only ", pct,
        "% of the total step time sampled is waiting for input. Therefore, "
        "you should focus on reducing ",
        non_input_time, ".");
  }
}
// Aggregates the per-step breakdown in `result` and classifies the overall
// bottleneck along three axes: input, kernel launch, and all-other time.
//
// Returns a default-constructed GenericBottleneck if a step_details entry
// cannot be unpacked as PerGenericStepDetails, and an "unknown" verdict when
// no step time was measured.
//
// Fixes: removed accumulators that were computed but never read
// (output, host-compute, host-compile, and device-to-device times), and
// dropped the redundant std::endl from the LOG statement (LOG already
// terminates the line).
GenericBottleneck GenericOverallBottleneck(
    const InputPipelineAnalysisResult& result) {
  double total_step_time_ms = 0;
  // Input time covers both waiting for input and host-to-device transfer.
  double total_input_ms = 0;
  // Host-prepare time is attributed to kernel-launch overhead.
  double total_host_prepare_ms = 0;
  double total_unknown_ms = 0;
  for (const google::protobuf::Any& step_details : result.step_details()) {
    PerGenericStepDetails details;
    bool success = step_details.UnpackTo(&details);
    if (!success && !step_details.type_url().empty()) {
      // A non-empty payload of the wrong type means this result was not
      // produced by the generic converter; bail out.
      LOG(ERROR) << "Unable to unpack step_breakdown. Expected: generic";
      return {};
    }
    total_step_time_ms += details.step_time_ms();
    total_input_ms +=
        details.host_wait_input_ms() + details.host_to_device_ms();
    total_host_prepare_ms += details.host_prepare_ms();
    total_unknown_ms += details.unknown_time_ms();
  }
  if (total_step_time_ms == 0) {
    return {{"unknown",
             "No step time measured. Therefore we cannot tell where the "
             "performance bottleneck is."},
            "no",
            "",
            "no",
            ""};
  }
  double input_percent = 100.0 * total_input_ms / total_step_time_ms;
  double kernel_launch_percent =
      100.0 * total_host_prepare_ms / total_step_time_ms;
  double all_other_percent = 100.0 * total_unknown_ms / total_step_time_ms;
  // The three analyses share a running observation index so that the emitted
  // statements are numbered consecutively.
  int observation_index = 0;
  string input_classification;
  string input_statement;
  InfeedAnalysis(result.hardware_type(), input_percent, &observation_index,
                 &input_classification, &input_statement);
  string kernel_launch_classification;
  string kernel_launch_statement;
  KernelLaunchAnalysis(kernel_launch_percent, &observation_index,
                       &kernel_launch_classification, &kernel_launch_statement);
  string all_other_classification;
  string all_other_statement;
  AllOtherAnalysis(all_other_percent, &observation_index,
                   &all_other_classification, &all_other_statement);
  return {{input_classification, input_statement},
          kernel_launch_classification,
          kernel_launch_statement,
          all_other_classification,
          all_other_statement};
}
} // namespace profiler
} // namespace tensorflow

View File

@ -25,8 +25,30 @@ limitations under the License.
namespace tensorflow {
namespace profiler {
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
const OpStats& op_stats, const HardwareType& hardware_type);
// Performance bottleneck common to all hardware types.
struct CommonBottleneck {
  // Indicates if input is a bottleneck. Possible values: "host" (input-bound),
  // "device" (not input-bound), "both", or "unknown" (no step time measured).
  string input_classification;
  // A human-readable description of the input bottleneck.
  string input_statement;
};
// Generic hardware bottleneck.
struct GenericBottleneck {
  // Bottleneck that exists on all hardware.
  CommonBottleneck common;
  // Indicates if kernel launching is a bottleneck. Possible values: "no",
  // "moderate", "high".
  string kernel_launch_classification;
  // A human-readable description of the kernel launching overhead.
  string kernel_launch_statement;
  // Indicates if all-other (unaccounted) time is a bottleneck. Possible
  // values: "no", "moderate", "high".
  string all_other_classification;
  // A human-readable description of the all-other overhead.
  string all_other_statement;
};
// Computes the summary of step time in milliseconds.
StepSummary ComputeStepTimeSummaryInMs(
@ -38,6 +60,17 @@ void GenerateHostResult(const OpMetricsDb& host_tf_metrics_db,
InputPipelineAnalysisRecommendation GenerateRecommendation();
// Returns the performance bottleneck of the program executed.
GenericBottleneck GenericOverallBottleneck(
const InputPipelineAnalysisResult& result);
InputPipelineAnalysisResult ConvertOpStatsToInputPipelineAnalysis(
const OpStats& op_stats, const HardwareType& hardware_type);
void InfeedAnalysis(HardwareType hardware_type, double infeed_percent,
int* observation_index, string* input_classification,
string* input_statement);
} // namespace profiler
} // namespace tensorflow

View File

@ -0,0 +1,160 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h"

#include <algorithm>
#include <utility>

#include "absl/strings/str_cat.h"
#include "google/protobuf/any.pb.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_metrics_to_record.h"
#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/input_pipeline.pb.h"
#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
#include "tensorflow/core/profiler/utils/math_utils.h"
#include "tensorflow/core/profiler/utils/op_metrics_db_utils.h"
#include "tensorflow/core/profiler/utils/time_utils.h"
namespace tensorflow {
namespace profiler {
namespace {
// Wraps plain text in an OverviewPageTip proto; the text is stored in the
// tip's "link" field, which holds its display string.
OverviewPageTip MakeOverviewPageTip(const string& text) {
  OverviewPageTip page_tip;
  page_tip.set_link(text);
  return page_tip;
}
// Renders an HTML anchor for `text` that opens `url` in a new tab.
string AnchorElement(const string& url, const string& text) {
  const string open_tag =
      absl::StrCat("<a href=\"", url, "\" target=\"_blank\">");
  return absl::StrCat(open_tag, text, "</a>");
}
// Makes a recommendation tip that links to a document.
// doc_url must already be escaped suitably for use in an HTML attribute.
OverviewPageTip MakeOverviewPageTipDocLink(const string& doc_url,
                                           const string& text) {
  OverviewPageTip doc_tip;
  doc_tip.set_link(AnchorElement(doc_url, text));
  return doc_tip;
}
// Appends the host-side tool tips to the recommendation.
void ComputeHostTips(OverviewPageRecommendation* re) {
  OverviewPageTip pipeline_tip = MakeOverviewPageTip(
      "input_pipeline_analyzer (especially Section 3 for the breakdown of "
      "input operations on the Host)");
  OverviewPageTip trace_tip = MakeOverviewPageTip(
      "trace_viewer (look at the activities on the timeline of each Host "
      "Thread near the bottom of the trace view)");
  *re->add_host_tips() = std::move(pipeline_tip);
  *re->add_host_tips() = std::move(trace_tip);
}
// Appends the device-side tool tips to the recommendation. For TPU, the
// trace-viewer timeline is labeled per TPU core rather than per device.
void ComputeDeviceTips(HardwareType hardware_type,
                       OverviewPageRecommendation* re) {
  const string& device_name = HardwareType_Name(hardware_type);
  string timeline_name = device_name;
  if (hardware_type == tensorflow::profiler::TPU) {
    timeline_name = "TPU core";
  }
  *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
      "op_profile (identify the time-consuming operations executed on the ",
      device_name, ")"));
  *re->add_device_tips() = MakeOverviewPageTip(absl::StrCat(
      "trace_viewer (look at the activities on the timeline of each ",
      timeline_name, " in the trace view)"));
}
// Appends the FAQ tip to the recommendation.
void ComputeFaqTips(OverviewPageRecommendation* re) {
  OverviewPageTip faq_tip = MakeOverviewPageTip("Refer to the Cloud tools FAQ");
  *re->add_faq_tips() = std::move(faq_tip);
}
// Appends the documentation-link tips to the recommendation.
void ComputeDocumentationTips(OverviewPageRecommendation* re) {
  OverviewPageTip dataset_doc_tip = MakeOverviewPageTipDocLink(
      "https://www.tensorflow.org/versions/master/api_docs/python/tf/data/"
      "Dataset",
      "TensorFlow Input Pipeline API");
  *re->add_documentation_tips() = std::move(dataset_doc_tip);
}
} // namespace
// Fills in the hardware-independent part of the recommendation: the input
// bottleneck verdict/statement plus the standard host, device, documentation,
// and FAQ tips.
void SetCommonRecommendation(const CommonBottleneck& bottleneck,
                             HardwareType hardware_type,
                             OverviewPageRecommendation* re) {
  re->set_bottleneck(bottleneck.input_classification);
  re->set_statement(bottleneck.input_statement);
  // Each helper appends to a distinct repeated field of `re`.
  ComputeHostTips(re);
  ComputeDeviceTips(hardware_type, re);
  ComputeDocumentationTips(re);
  ComputeFaqTips(re);
}
// Packs the generic-hardware bottleneck verdicts (kernel launch and all-other
// time) into the recommendation's Any-typed `recommendation` field.
OverviewPageRecommendation ComputeGenericRecommendation(
    const GenericBottleneck& bottleneck) {
  GenericRecommendation generic;
  generic.set_kernel_launch_bottleneck(bottleneck.kernel_launch_classification);
  generic.set_kernel_launch_statement(bottleneck.kernel_launch_statement);
  generic.set_all_other_bottleneck(bottleneck.all_other_classification);
  generic.set_all_other_statement(bottleneck.all_other_statement);
  OverviewPageRecommendation re;
  re.mutable_recommendation()->PackFrom(generic);
  return re;
}
// Builds the overview analysis: the top device ops by self time, each with
// its self-time fraction, running cumulative fraction, and flop rate.
OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats) {
  constexpr int kNumTopOpsShown = 10;
  OverviewPageAnalysis analysis;
  OpMetricsDb device_tf_metrics_db =
      CreateTfMetricsDbFromHloMetricsDb(op_stats.device_op_metrics_db());
  uint64 total_device_time_ps = device_tf_metrics_db.total_time_ps();
  double cumulative_fraction = 0.0;
  for (const OpMetrics* op_metrics :
       SortedOpMetricsDb(device_tf_metrics_db, kNumTopOpsShown)) {
    OverviewTfOp* top_op = analysis.add_top_device_ops();
    top_op->set_name(op_metrics->name());
    top_op->set_category(op_metrics->category());
    top_op->set_self_time_fraction(
        SafeDivide(op_metrics->self_time_ps(), total_device_time_ps));
    // Accumulate the value read back from the proto field so any storage
    // rounding matches what the reader of this proto will see.
    cumulative_fraction += top_op->self_time_fraction();
    top_op->set_cumulative_time_fraction(cumulative_fraction);
    // flops per nanosecond of op time (numerically GFLOP/s).
    top_op->set_flop_rate(
        SafeDivide(op_metrics->flops(), PicosToNanos(op_metrics->time_ps())));
  }
  return analysis;
}
// Converts OpStats to the OverviewPage proto, combining the run environment,
// the top-ops analysis, the input-pipeline analysis, and the bottleneck-based
// recommendation.
OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats,
                                          HardwareType hardware_type) {
  InputPipelineAnalysisResult input_analysis =
      ConvertOpStatsToInputPipelineAnalysis(op_stats, hardware_type);
  GenericBottleneck bottleneck = GenericOverallBottleneck(input_analysis);
  OverviewPageRecommendation recommendation =
      ComputeGenericRecommendation(bottleneck);
  SetCommonRecommendation(bottleneck.common, hardware_type, &recommendation);
  OverviewPage overview_page;
  *overview_page.mutable_run_environment() = op_stats.run_environment();
  *overview_page.mutable_analysis() = ComputeAnalysisResult(op_stats);
  *overview_page.mutable_input_analysis() = input_analysis;
  *overview_page.mutable_recommendation() = recommendation;
  return overview_page;
}
} // namespace profiler
} // namespace tensorflow

View File

@ -0,0 +1,45 @@
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_
#define TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_
#include "absl/strings/string_view.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h"
#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h"
#include "tensorflow/core/profiler/protobuf/op_stats.pb.h"
#include "tensorflow/core/profiler/protobuf/overview_page.pb.h"
namespace tensorflow {
namespace profiler {
void SetCommonRecommendation(const CommonBottleneck& bottleneck,
HardwareType hardware_type,
OverviewPageRecommendation* re);
OverviewPageRecommendation ComputeGenericRecommendation(
const GenericBottleneck& bottleneck);
OverviewPageAnalysis ComputeAnalysisResult(const OpStats& op_stats);
OverviewPage ConvertOpStatsToOverviewPage(const OpStats& op_stats,
HardwareType hardware_type);
} // namespace profiler
} // namespace tensorflow
#endif // TENSORFLOW_CORE_PROFILER_CONVERT_OP_STATS_TO_OVERVIEW_PAGE_H_

View File

@ -40,7 +40,10 @@ tf_proto_library(
name = "overview_page_proto",
srcs = ["overview_page.proto"],
cc_api_version = 2,
protodeps = [":input_pipeline_proto"],
protodeps = [
":input_pipeline_proto",
":op_stats_proto",
],
visibility = [
":friends",
],

View File

@ -54,7 +54,7 @@ message SystemTopology {
int64 num_expected_reduced_chips = 4;
}
// Result proto for RunEnvironment (the run environment of a profiling session).
// The run environment of a profiling session.
message RunEnvironment {
// Number of hosts used.
int32 host_count = 1;
@ -71,9 +71,9 @@ message RunEnvironment {
int32 device_core_count = 5;
// The per-device-core batch size.
int32 per_core_batch_size = 6;
// Host-independent job information.
// Host-independent information about this job.
HostIndependentJobInfoResult host_independent_job_info = 7;
// Host-dependent job information.
// Host-dependent information about this job.
repeated HostDependentJobInfoResult host_dependent_job_info = 8;
// The number of replicas, corresponds to input parallelism.
// If there is no model parallelism, replica_count = device_core_count
@ -97,4 +97,6 @@ message OpStats {
PerfEnv perf_env = 3;
// The database of step sequences.
StepDatabaseResult step_db = 4;
// The run environment of this profiling session.
RunEnvironment run_environment = 5;
}

View File

@ -4,59 +4,7 @@ package tensorflow.profiler;
import "google/protobuf/any.proto";
import "tensorflow/core/profiler/protobuf/input_pipeline.proto";
// Overview result for host-independent job information.
message OverviewPageHostIndependentJobInfo {
  // The changelist (CL) number of the build.
  int64 change_list = 1;
  // The time of this build (nanoseconds since the Unix epoch).
  int64 build_time = 2;
  // The target of this build.
  string build_target = 3;
  // Profiling duration (in ms).
  uint32 profile_duration_ms = 4;
}
// Overview result for host-dependent job information.
message OverviewPageHostDependentJobInfo {
  // The ID of the host where this job was run.
  string host_id = 1;
  // The command line used to start this run.
  string command_line = 2;
  // The start time of this run (nanoseconds since the Unix epoch).
  int64 start_time = 3;
  // BNS address specified by the client at the time of the profiling request.
  string bns_address = 4;
  // Profiling start walltime (in ns).
  uint64 profile_time_ns = 5;
}
// Overview result for the run environment of a profiling session.
message OverviewPageRunEnvironment {
  // Number of hosts used.
  int32 host_count = 1;
  // Number of tasks used.
  int32 task_count = 2;
  // The type of device used.
  string device_type = 3;
  // The number of device cores used.
  // What "device core" means depends on the platform:
  //   For TPU, a device core is a TPU core.
  //   For Nvidia GPU, a device core is a GPU (not an SM).
  int32 device_core_count = 4;
  // The per-device-core batch size.
  int32 per_core_batch_size = 5;
  // Host-independent information about this job.
  OverviewPageHostIndependentJobInfo host_independent_job_info = 6;
  // Host-dependent information about this job.
  repeated OverviewPageHostDependentJobInfo host_dependent_job_info = 7;
  // The number of replicas; corresponds to input parallelism.
  // If there is no model parallelism, replica_count = device_core_count.
  int32 replica_count = 8;
  // The number of cores used for a single replica, e.g. model parallelism.
  // If there is no model parallelism, then num_cores_per_replica = 1.
  int32 num_cores_per_replica = 9;
}
import "tensorflow/core/profiler/protobuf/op_stats.proto";
// Overview result for a TensorFlow Op.
message OverviewTfOp {
@ -138,11 +86,12 @@ message OverviewPageRecommendation {
message OverviewPage {
// The run environment of the profiled session.
OverviewPageRunEnvironment run_environment = 1;
RunEnvironment run_environment = 5;
// The step-time result.
InputPipelineAnalysisResult input_analysis = 2;
// The other analysis result.
OverviewPageAnalysis analysis = 3;
// The recommendation made to the user.
OverviewPageRecommendation recommendation = 4;
reserved 1;
}