Add suggestion and summary to CombinedTfDataStats.
PiperOrigin-RevId: 337966727 Change-Id: I7a11c83f87454551d9c33383193c3d463409ae26
This commit is contained in:
parent
73b709743a
commit
9a36023aa3
tensorflow/core/profiler
@ -714,6 +714,7 @@ cc_library(
|
||||
"//tensorflow/core/profiler/protobuf:tf_data_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:xplane_proto_cc",
|
||||
"//tensorflow/core/profiler/utils:group_events",
|
||||
"//tensorflow/core/profiler/utils:html_utils",
|
||||
"//tensorflow/core/profiler/utils:tf_op_utils",
|
||||
"//tensorflow/core/profiler/utils:tf_xplane_visitor",
|
||||
"//tensorflow/core/profiler/utils:timespan",
|
||||
@ -722,6 +723,7 @@ cc_library(
|
||||
"@com_google_absl//absl/container:flat_hash_map",
|
||||
"@com_google_absl//absl/container:flat_hash_set",
|
||||
"@com_google_absl//absl/strings",
|
||||
"@com_google_absl//absl/strings:str_format",
|
||||
],
|
||||
)
|
||||
|
||||
|
@ -17,12 +17,14 @@ limitations under the License.
|
||||
|
||||
#include "absl/container/flat_hash_map.h"
|
||||
#include "absl/container/flat_hash_set.h"
|
||||
#include "absl/strings/str_format.h"
|
||||
#include "absl/strings/str_split.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/lib/gtl/map_util.h"
|
||||
#include "tensorflow/core/platform/protobuf.h"
|
||||
#include "tensorflow/core/profiler/protobuf/tf_data_stats.pb.h"
|
||||
#include "tensorflow/core/profiler/utils/group_events.h"
|
||||
#include "tensorflow/core/profiler/utils/html_utils.h"
|
||||
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
|
||||
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
|
||||
#include "tensorflow/core/profiler/utils/timespan.h"
|
||||
@ -274,8 +276,150 @@ void SetBottleneckAnalysis(absl::string_view host_name,
|
||||
}
|
||||
}
|
||||
|
||||
// Builds the HTML suggestion text shown for a bottleneck of the given `type`.
// Numbered items are separated with <br/> and each one links to the relevant
// section of the tf.data performance guides through AnchorElement(). Types
// without a dedicated suggestion get a generic pointer to the analysis
// playbook.
std::string GetSuggestion(BottleneckType type) {
  // tf.data performance analysis playbook and its sections.
  constexpr absl::string_view kAnalysisUrl =
      "https://www.tensorflow.org/guide/data_performance_analysis";
  constexpr absl::string_view kAnalysisSourceUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#source_datasets";
  constexpr absl::string_view kAnalysisCpuUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#3_are_you_reaching_high_cpu_utilization";
  constexpr absl::string_view kAnalysisTransformationUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#transformation_datasets";
  // tf.data performance guide sections.
  constexpr absl::string_view kGuideParallelExtractionUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_extraction";
  constexpr absl::string_view kGuideParallelTransformationUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_transformation";
  constexpr absl::string_view kGuideCacheUrl =
      "https://www.tensorflow.org/guide/data_performance#caching";

  if (type == BottleneckType::kSlowSource) {
    const auto playbook_anchor = AnchorElement(kAnalysisSourceUrl, "here");
    const auto guide_anchor =
        AnchorElement(kGuideParallelExtractionUrl, "here");
    return absl::StrFormat(
        "1. Check the locality of a host and input data. Ideally, they "
        "should be in the same cell (or very close, like the same "
        "region).<br/>"
        "2. Parallelize reading from this dataset source. See %s and %s for "
        "more details.<br/>",
        playbook_anchor, guide_anchor);
  }

  if (type == BottleneckType::kSlowRemoteSource) {
    const auto playbook_anchor = AnchorElement(kAnalysisUrl, "this");
    return absl::StrFormat(
        "1. The remote data source is slow. Profile its host to analyze the "
        "issue further.<br/>"
        "2. See %s for other suggestions.",
        playbook_anchor);
  }

  if (type == BottleneckType::kSlowTransformationWithParallelVersion) {
    const auto parallel_anchor =
        AnchorElement(kGuideParallelTransformationUrl, "this");
    const auto cache_anchor = AnchorElement(kGuideCacheUrl, "this");
    const auto resources_anchor =
        AnchorElement(kAnalysisTransformationUrl, "here");
    return absl::StrFormat(
        "1. Parallelize this transformation by setting "
        "<code>num_parallel_calls=tf.data.experimental.AUTOTUNE</code>. See "
        "%s for more details.<br/>"
        "2. Consider adding <code>cache</code> after this transformation if "
        "your data fits into memory and it is appropriate (e.g., there is no "
        "randomness in upstream transformations like <code>shuffle</code>). "
        "See %s for more details.<br/>"
        "3. Find more resources %s.",
        parallel_anchor, cache_anchor, resources_anchor);
  }

  if (type == BottleneckType::kSlowTransformationWithoutParallelVersion) {
    const auto sharding_anchor =
        AnchorElement(kAnalysisTransformationUrl, "this");
    const auto cache_anchor = AnchorElement(kGuideCacheUrl, "this");
    const auto resources_anchor = AnchorElement(kAnalysisCpuUrl, "here");
    return absl::StrFormat(
        "1. This transformation is inherently sequential. Add outer "
        "parallelism by running multiple copies of the input pipeline over "
        "sharded inputs and combining the results. See %s for more "
        "details.<br/>"
        "2. Consider adding <code>cache</code> after this transformation if "
        "your data fits into memory and it is appropriate (e.g., there is no "
        "randomness in upstream transformations like <code>shuffle</code>). "
        "See %s for more details.<br/>"
        "3. Find more resources %s.",
        sharding_anchor, cache_anchor, resources_anchor);
  }

  // Any other bottleneck type: point at the playbook as a whole.
  return absl::StrFormat("See %s for suggestions.",
                         AnchorElement(kAnalysisUrl, "this"));
}
|
||||
|
||||
// Fills in `bottleneck_analysis->suggestion` based on the bottleneck iterator
// name. Nothing is written unless the bottleneck's max latency exceeds
// kSlowCallThresholdPs.
void SetSuggestion(TfDataBottleneckAnalysis* bottleneck_analysis) {
  const bool exceeds_threshold =
      bottleneck_analysis->max_latency_ps() > kSlowCallThresholdPs;
  if (exceeds_threshold) {
    const BottleneckType bottleneck_type =
        GetBottleneckType(bottleneck_analysis->iterator_name());
    bottleneck_analysis->set_suggestion(GetSuggestion(bottleneck_type));
  }
}
|
||||
|
||||
// Sets `is_input_bound` and `summary` on `combined_tf_data_stats` from the max
// latency recorded in its bottleneck analysis:
//   - above kSlowCallThresholdPs: input bound, slow-pipeline summary;
//   - positive but not above the threshold: not input bound;
//   - otherwise: not input bound, no tf.data activity was captured.
void SetSummary(CombinedTfDataStats* combined_tf_data_stats) {
  const int64 bottleneck_latency_ps =
      combined_tf_data_stats->bottleneck_analysis().max_latency_ps();
  const bool is_slow = bottleneck_latency_ps > kSlowCallThresholdPs;

  // Only a pipeline slower than the threshold counts as input bound.
  combined_tf_data_stats->set_is_input_bound(is_slow);

  if (is_slow) {
    combined_tf_data_stats->set_summary(
        "Your profile has a tf.data input pipeline slower than 50 us. Below "
        "shows a bottleneck in the slow input pipeline and a suggestion on how "
        "to fix it.");
    return;
  }

  if (bottleneck_latency_ps > 0) {
    combined_tf_data_stats->set_summary(
        "Your profile does not have any tf.data input pipeline slower than 50 "
        "us. Your job could be still input bound if this profile didn't "
        "capture all workers.");
    return;
  }

  // The text below is matched verbatim by the unit tests; keep it unchanged.
  combined_tf_data_stats->set_summary(
      "No tf.data activitiy captured in your profile. If your job uses "
      "tf.data, try to capture a longer profile.");
}
|
||||
|
||||
} // namespace
|
||||
|
||||
// Classifies a bottleneck iterator by its name. Names that are not in the
// table below map to BottleneckType::kOther.
BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name) {
  using TypeByIteratorName =
      absl::flat_hash_map<absl::string_view, BottleneckType>;
  // Heap-allocated once on first call and never deleted.
  static const TypeByIteratorName* const kTypeByIteratorName =
      new TypeByIteratorName({
          // Read from storage.
          {"TFRecord", BottleneckType::kSlowSource},
          {"SSTable", BottleneckType::kSlowSource},
          {"RecordIO", BottleneckType::kSlowSource},
          {"Spanner", BottleneckType::kSlowSource},
          {"TFColumn", BottleneckType::kSlowSource},
          {"SleepwalkRemoteDataset", BottleneckType::kSlowSource},
          {"TextLine", BottleneckType::kSlowSource},
          {"StitchedTimelineDataset", BottleneckType::kSlowSource},
          {"DateKeyDataset", BottleneckType::kSlowSource},
          {"CapacitorProto", BottleneckType::kSlowSource},
          {"LMDB", BottleneckType::kSlowSource},
          {"ExternalDataset", BottleneckType::kSlowSource},
          {"PearModel", BottleneckType::kSlowSource},
          {"FixedLengthRecordV2", BottleneckType::kSlowSource},
          // Read from local memory.
          {"FromTensor", BottleneckType::kSlowSource},
          {"TensorSlice", BottleneckType::kSlowSource},
          {"Generator", BottleneckType::kSlowSource},
          {"SyntheticDatasetOp", BottleneckType::kSlowSource},
          // Read from remote memory.
          {"GuzzlerDataGuzzlerRemoteDataset",
           BottleneckType::kSlowRemoteSource},
          {"ReverbDataset", BottleneckType::kSlowRemoteSource},
          {"DatasetService", BottleneckType::kSlowRemoteSource},
          {"DatasetSampleGame", BottleneckType::kSlowRemoteSource},
          {"Courier", BottleneckType::kSlowRemoteSource},
          {"ReverbEpisodeDataset", BottleneckType::kSlowRemoteSource},
          // Transformations with parallel version.
          {"Map", BottleneckType::kSlowTransformationWithParallelVersion},
          {"Interleave",
           BottleneckType::kSlowTransformationWithParallelVersion},
          // Transformations without parallel version.
          {"Filter",
           BottleneckType::kSlowTransformationWithoutParallelVersion},
          {"Batch", BottleneckType::kSlowTransformationWithoutParallelVersion},
          {"Unbatch",
           BottleneckType::kSlowTransformationWithoutParallelVersion},
      });

  const auto entry = kTypeByIteratorName->find(bottleneck_iterator_name);
  return entry == kTypeByIteratorName->end() ? BottleneckType::kOther
                                             : entry->second;
}
|
||||
|
||||
void CombinedTfDataStatsBuilder::Add(absl::string_view host_name,
|
||||
XPlane* host_plane) {
|
||||
TfDataStats& tf_data_stats =
|
||||
@ -302,6 +446,8 @@ void CombinedTfDataStatsBuilder::Finalize() {
|
||||
host_name_and_tf_data_stats.second,
|
||||
bottleneck_analysis);
|
||||
}
|
||||
if (generate_suggestion_) SetSuggestion(bottleneck_analysis);
|
||||
SetSummary(combined_tf_data_stats_);
|
||||
}
|
||||
|
||||
} // namespace profiler
|
||||
|
@ -27,11 +27,23 @@ namespace profiler {
|
||||
|
||||
TF_CONST_INIT extern const int64 kSlowCallThresholdPs;
|
||||
|
||||
// Category of the iterator identified as the input pipeline bottleneck; see
// GetBottleneckType() for the mapping from iterator names.
enum class BottleneckType {
  // Source dataset reading from storage or local memory.
  kSlowSource,
  // Source dataset reading from remote memory.
  kSlowRemoteSource,
  // Transformation that has a parallel version.
  kSlowTransformationWithParallelVersion,
  // Transformation that has no parallel version.
  kSlowTransformationWithoutParallelVersion,
  // Any iterator whose name is not recognized.
  kOther,
};
|
||||
|
||||
BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name);
|
||||
|
||||
class CombinedTfDataStatsBuilder {
|
||||
public:
|
||||
explicit CombinedTfDataStatsBuilder(
|
||||
CombinedTfDataStats* combined_tf_data_stats)
|
||||
: combined_tf_data_stats_(combined_tf_data_stats) {}
|
||||
CombinedTfDataStats* combined_tf_data_stats,
|
||||
bool generate_suggestion = true)
|
||||
: combined_tf_data_stats_(combined_tf_data_stats),
|
||||
generate_suggestion_(generate_suggestion) {}
|
||||
|
||||
void Add(absl::string_view host_name, XPlane* host_plane);
|
||||
|
||||
@ -40,6 +52,7 @@ class CombinedTfDataStatsBuilder {
|
||||
|
||||
private:
|
||||
CombinedTfDataStats* combined_tf_data_stats_;
|
||||
bool generate_suggestion_;
|
||||
};
|
||||
|
||||
} // namespace profiler
|
||||
|
@ -77,13 +77,15 @@ TEST(XPlaneToTfDataStatsTest, HostInputPipeline) {
|
||||
CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
|
||||
builder.Add("host1", &host_plane);
|
||||
builder.Finalize();
|
||||
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
|
||||
EXPECT_THAT(
|
||||
combined_tf_data_stats, EqualsProto(R"pb(
|
||||
bottleneck_analysis: {
|
||||
host: "host1"
|
||||
input_pipeline: "Host:0"
|
||||
max_latency_ps: 100000000
|
||||
iterator_name: "Range"
|
||||
iterator_long_name: "Iterator::Prefetch::Range"
|
||||
suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
|
||||
}
|
||||
tf_data_stats: {
|
||||
key: "host1"
|
||||
@ -169,6 +171,8 @@ TEST(XPlaneToTfDataStatsTest, HostInputPipeline) {
|
||||
}
|
||||
}
|
||||
}
|
||||
is_input_bound: true
|
||||
summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
|
||||
)pb"));
|
||||
}
|
||||
|
||||
@ -205,7 +209,8 @@ TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
|
||||
builder.Add("host1", &host_plane);
|
||||
builder.Finalize();
|
||||
// Device input pipeline is not considered for bottleneck analysis.
|
||||
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
|
||||
EXPECT_THAT(
|
||||
combined_tf_data_stats, EqualsProto(R"pb(
|
||||
bottleneck_analysis: {}
|
||||
tf_data_stats: {
|
||||
key: "host1"
|
||||
@ -280,6 +285,7 @@ TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
|
||||
}
|
||||
}
|
||||
}
|
||||
summary: "No tf.data activitiy captured in your profile. If your job uses tf.data, try to capture a longer profile."
|
||||
)pb"));
|
||||
}
|
||||
|
||||
@ -325,13 +331,15 @@ TEST(XPlaneToTfDataStatsTest, MapAndBatch) {
|
||||
CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
|
||||
builder.Add("host1", &host_plane);
|
||||
builder.Finalize();
|
||||
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
|
||||
EXPECT_THAT(
|
||||
combined_tf_data_stats, EqualsProto(R"pb(
|
||||
bottleneck_analysis: {
|
||||
host: "host1"
|
||||
input_pipeline: "Host:0"
|
||||
max_latency_ps: 100000000
|
||||
iterator_name: "Range"
|
||||
iterator_long_name: "Iterator::MapAndBatch::Range"
|
||||
suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
|
||||
}
|
||||
tf_data_stats: {
|
||||
key: "host1"
|
||||
@ -392,6 +400,8 @@ TEST(XPlaneToTfDataStatsTest, MapAndBatch) {
|
||||
}
|
||||
}
|
||||
}
|
||||
is_input_bound: true
|
||||
summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
|
||||
)pb"));
|
||||
}
|
||||
|
||||
|
@ -101,10 +101,16 @@ message TfDataBottleneckAnalysis {
|
||||
string iterator_name = 4;
|
||||
// Long name of the bottleneck iterator.
|
||||
string iterator_long_name = 5;
|
||||
// Suggestion to resolve the bottleneck.
|
||||
string suggestion = 6;
|
||||
}
|
||||
|
||||
// TfDataStats of all hosts.
|
||||
message CombinedTfDataStats {
|
||||
// Whether it is input bound.
|
||||
bool is_input_bound = 3;
|
||||
// Summary of the analysis.
|
||||
string summary = 4;
|
||||
// Bottleneck analysis result.
|
||||
TfDataBottleneckAnalysis bottleneck_analysis = 1;
|
||||
// TfDataStats per host.
|
||||
|
Loading…
Reference in New Issue
Block a user