Add suggestion and summary to CombinedTfDataStats.

PiperOrigin-RevId: 337966727
Change-Id: I7a11c83f87454551d9c33383193c3d463409ae26
This commit is contained in:
Jiho Choi 2020-10-19 17:14:51 -07:00 committed by TensorFlower Gardener
parent 73b709743a
commit 9a36023aa3
5 changed files with 404 additions and 227 deletions

View File

@ -714,6 +714,7 @@ cc_library(
"//tensorflow/core/profiler/protobuf:tf_data_stats_proto_cc",
"//tensorflow/core/profiler/protobuf:xplane_proto_cc",
"//tensorflow/core/profiler/utils:group_events",
"//tensorflow/core/profiler/utils:html_utils",
"//tensorflow/core/profiler/utils:tf_op_utils",
"//tensorflow/core/profiler/utils:tf_xplane_visitor",
"//tensorflow/core/profiler/utils:timespan",
@ -722,6 +723,7 @@ cc_library(
"@com_google_absl//absl/container:flat_hash_map",
"@com_google_absl//absl/container:flat_hash_set",
"@com_google_absl//absl/strings",
"@com_google_absl//absl/strings:str_format",
],
)

View File

@ -17,12 +17,14 @@ limitations under the License.
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/profiler/protobuf/tf_data_stats.pb.h"
#include "tensorflow/core/profiler/utils/group_events.h"
#include "tensorflow/core/profiler/utils/html_utils.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/timespan.h"
@ -274,8 +276,150 @@ void SetBottleneckAnalysis(absl::string_view host_name,
}
}
// Builds the HTML-formatted suggestion text shown for a bottleneck of the
// given `type`. Every suggestion embeds one or more links rendered through
// AnchorElement(); types without a dedicated message fall back to a generic
// pointer to the tf.data performance analysis guide.
std::string GetSuggestion(BottleneckType type) {
  // Sections of the tf.data performance analysis guide.
  constexpr absl::string_view kAnalysisGuideUrl =
      "https://www.tensorflow.org/guide/data_performance_analysis";
  constexpr absl::string_view kAnalysisSourceDatasetsUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#source_datasets";
  constexpr absl::string_view kAnalysisCpuUtilizationUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#3_are_you_reaching_high_cpu_utilization";
  constexpr absl::string_view kAnalysisTransformationsUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#transformation_datasets";
  // Sections of the tf.data performance guide.
  constexpr absl::string_view kGuideParallelExtractionUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_extraction";
  constexpr absl::string_view kGuideParallelTransformationUrl =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_transformation";
  constexpr absl::string_view kGuideCachingUrl =
      "https://www.tensorflow.org/guide/data_performance#caching";

  switch (type) {
    case BottleneckType::kSlowSource: {
      const auto source_datasets_anchor =
          AnchorElement(kAnalysisSourceDatasetsUrl, "here");
      const auto parallel_extraction_anchor =
          AnchorElement(kGuideParallelExtractionUrl, "here");
      return absl::StrFormat(
          "1. Check the locality of a host and input data. Ideally, they "
          "should be in the same cell (or very close, like the same "
          "region).<br/>"
          "2. Parallelize reading from this dataset source. See %s and %s for "
          "more details.<br/>",
          source_datasets_anchor, parallel_extraction_anchor);
    }
    case BottleneckType::kSlowRemoteSource: {
      const auto guide_anchor = AnchorElement(kAnalysisGuideUrl, "this");
      return absl::StrFormat(
          "1. The remote data source is slow. Profile its host to analyze the "
          "issue further.<br/>"
          "2. See %s for other suggestions.",
          guide_anchor);
    }
    case BottleneckType::kSlowTransformationWithParallelVersion: {
      const auto parallel_transformation_anchor =
          AnchorElement(kGuideParallelTransformationUrl, "this");
      const auto caching_anchor = AnchorElement(kGuideCachingUrl, "this");
      const auto more_resources_anchor =
          AnchorElement(kAnalysisTransformationsUrl, "here");
      return absl::StrFormat(
          "1. Parallelize this transformation by setting "
          "<code>num_parallel_calls=tf.data.experimental.AUTOTUNE</code>. See "
          "%s for more details.<br/>"
          "2. Consider adding <code>cache</code> after this transformation if "
          "your data fits into memory and it is appropriate (e.g., there is no "
          "randomness in upstream transformations like <code>shuffle</code>). "
          "See %s for more details.<br/>"
          "3. Find more resources %s.",
          parallel_transformation_anchor, caching_anchor,
          more_resources_anchor);
    }
    case BottleneckType::kSlowTransformationWithoutParallelVersion: {
      const auto outer_parallelism_anchor =
          AnchorElement(kAnalysisTransformationsUrl, "this");
      const auto caching_anchor = AnchorElement(kGuideCachingUrl, "this");
      const auto more_resources_anchor =
          AnchorElement(kAnalysisCpuUtilizationUrl, "here");
      return absl::StrFormat(
          "1. This transformation is inherently sequential. Add outer "
          "parallelism by running multiple copies of the input pipeline over "
          "sharded inputs and combining the results. See %s for more "
          "details.<br/>"
          "2. Consider adding <code>cache</code> after this transformation if "
          "your data fits into memory and it is appropriate (e.g., there is no "
          "randomness in upstream transformations like <code>shuffle</code>). "
          "See %s for more details.<br/>"
          "3. Find more resources %s.",
          outer_parallelism_anchor, caching_anchor, more_resources_anchor);
    }
    default:
      break;
  }
  // Generic fallback for every type without a dedicated message.
  return absl::StrFormat("See %s for suggestions.",
                         AnchorElement(kAnalysisGuideUrl, "this"));
}
// Attaches a suggestion to `bottleneck_analysis`, but only when its max
// latency exceeds kSlowCallThresholdPs; otherwise the suggestion field is left
// untouched. The suggestion is selected from the bottleneck iterator's name.
void SetSuggestion(TfDataBottleneckAnalysis* bottleneck_analysis) {
  const bool exceeds_threshold =
      bottleneck_analysis->max_latency_ps() > kSlowCallThresholdPs;
  if (exceeds_threshold) {
    const BottleneckType bottleneck_type =
        GetBottleneckType(bottleneck_analysis->iterator_name());
    bottleneck_analysis->set_suggestion(GetSuggestion(bottleneck_type));
  }
}
// Fills in `is_input_bound` and `summary` of `combined_tf_data_stats` from the
// max latency recorded in its bottleneck analysis:
//   - above kSlowCallThresholdPs: input bound, a slow pipeline was found;
//   - positive but not above the threshold: not input bound;
//   - otherwise: no tf.data activity was captured.
// NOTE: the summary strings are compared verbatim by the unit tests, so they
// must be kept exactly as written.
void SetSummary(CombinedTfDataStats* combined_tf_data_stats) {
  const int64 bottleneck_latency_ps =
      combined_tf_data_stats->bottleneck_analysis().max_latency_ps();
  const bool input_bound = bottleneck_latency_ps > kSlowCallThresholdPs;
  combined_tf_data_stats->set_is_input_bound(input_bound);

  if (input_bound) {
    combined_tf_data_stats->set_summary(
        "Your profile has a tf.data input pipeline slower than 50 us. Below "
        "shows a bottleneck in the slow input pipeline and a suggestion on how "
        "to fix it.");
    return;
  }
  if (bottleneck_latency_ps > 0) {
    combined_tf_data_stats->set_summary(
        "Your profile does not have any tf.data input pipeline slower than 50 "
        "us. Your job could be still input bound if this profile didn't "
        "capture all workers.");
    return;
  }
  combined_tf_data_stats->set_summary(
      "No tf.data activitiy captured in your profile. If your job uses "
      "tf.data, try to capture a longer profile.");
}
} // namespace
// Classifies a bottleneck iterator by its name. Names are looked up with an
// exact match in a fixed table; any name that is not listed maps to
// BottleneckType::kOther.
BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name) {
  using IteratorTypeMap = absl::flat_hash_map<absl::string_view, BottleneckType>;
  // Built on first use and intentionally not deleted in this function.
  static const IteratorTypeMap* const kTypeByIteratorName = new IteratorTypeMap(
      {// Read from storage.
       {"TFRecord", BottleneckType::kSlowSource},
       {"SSTable", BottleneckType::kSlowSource},
       {"RecordIO", BottleneckType::kSlowSource},
       {"Spanner", BottleneckType::kSlowSource},
       {"TFColumn", BottleneckType::kSlowSource},
       {"SleepwalkRemoteDataset", BottleneckType::kSlowSource},
       {"TextLine", BottleneckType::kSlowSource},
       {"StitchedTimelineDataset", BottleneckType::kSlowSource},
       {"DateKeyDataset", BottleneckType::kSlowSource},
       {"CapacitorProto", BottleneckType::kSlowSource},
       {"LMDB", BottleneckType::kSlowSource},
       {"ExternalDataset", BottleneckType::kSlowSource},
       {"PearModel", BottleneckType::kSlowSource},
       {"FixedLengthRecordV2", BottleneckType::kSlowSource},
       // Read from local memory.
       {"FromTensor", BottleneckType::kSlowSource},
       {"TensorSlice", BottleneckType::kSlowSource},
       {"Generator", BottleneckType::kSlowSource},
       {"SyntheticDatasetOp", BottleneckType::kSlowSource},
       // Read from remote memory.
       {"GuzzlerDataGuzzlerRemoteDataset", BottleneckType::kSlowRemoteSource},
       {"ReverbDataset", BottleneckType::kSlowRemoteSource},
       {"DatasetService", BottleneckType::kSlowRemoteSource},
       {"DatasetSampleGame", BottleneckType::kSlowRemoteSource},
       {"Courier", BottleneckType::kSlowRemoteSource},
       {"ReverbEpisodeDataset", BottleneckType::kSlowRemoteSource},
       // Transformations with parallel version.
       {"Map", BottleneckType::kSlowTransformationWithParallelVersion},
       {"Interleave", BottleneckType::kSlowTransformationWithParallelVersion},
       // Transformations without parallel version.
       {"Filter", BottleneckType::kSlowTransformationWithoutParallelVersion},
       {"Batch", BottleneckType::kSlowTransformationWithoutParallelVersion},
       {"Unbatch", BottleneckType::kSlowTransformationWithoutParallelVersion}});

  const auto entry = kTypeByIteratorName->find(bottleneck_iterator_name);
  return entry == kTypeByIteratorName->end() ? BottleneckType::kOther
                                             : entry->second;
}
void CombinedTfDataStatsBuilder::Add(absl::string_view host_name,
XPlane* host_plane) {
TfDataStats& tf_data_stats =
@ -302,6 +446,8 @@ void CombinedTfDataStatsBuilder::Finalize() {
host_name_and_tf_data_stats.second,
bottleneck_analysis);
}
if (generate_suggestion_) SetSuggestion(bottleneck_analysis);
SetSummary(combined_tf_data_stats_);
}
} // namespace profiler

View File

@ -27,11 +27,23 @@ namespace profiler {
TF_CONST_INIT extern const int64 kSlowCallThresholdPs;
// Category of a tf.data bottleneck iterator, as assigned by
// GetBottleneckType() from the iterator's name. The category selects which
// suggestion text is generated for the bottleneck.
enum class BottleneckType {
// Source dataset that reads from storage or from local memory.
kSlowSource,
// Source dataset that reads from remote memory.
kSlowRemoteSource,
// Transformation that has a parallel version (e.g. Map, Interleave).
kSlowTransformationWithParallelVersion,
// Transformation that has no parallel version (e.g. Filter, Batch, Unbatch).
kSlowTransformationWithoutParallelVersion,
// Any iterator whose name is not recognized by GetBottleneckType().
kOther,
};
// Returns the BottleneckType for the given bottleneck iterator name (e.g.
// "Map", "TFRecord"). Names are matched exactly against a fixed table; names
// that are not in the table yield BottleneckType::kOther.
BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name);
class CombinedTfDataStatsBuilder {
public:
explicit CombinedTfDataStatsBuilder(
CombinedTfDataStats* combined_tf_data_stats)
: combined_tf_data_stats_(combined_tf_data_stats) {}
CombinedTfDataStats* combined_tf_data_stats,
bool generate_suggestion = true)
: combined_tf_data_stats_(combined_tf_data_stats),
generate_suggestion_(generate_suggestion) {}
void Add(absl::string_view host_name, XPlane* host_plane);
@ -40,6 +52,7 @@ class CombinedTfDataStatsBuilder {
private:
CombinedTfDataStats* combined_tf_data_stats_;
bool generate_suggestion_;
};
} // namespace profiler

View File

@ -77,13 +77,15 @@ TEST(XPlaneToTfDataStatsTest, HostInputPipeline) {
CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
builder.Add("host1", &host_plane);
builder.Finalize();
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
EXPECT_THAT(
combined_tf_data_stats, EqualsProto(R"pb(
bottleneck_analysis: {
host: "host1"
input_pipeline: "Host:0"
max_latency_ps: 100000000
iterator_name: "Range"
iterator_long_name: "Iterator::Prefetch::Range"
suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
}
tf_data_stats: {
key: "host1"
@ -169,6 +171,8 @@ TEST(XPlaneToTfDataStatsTest, HostInputPipeline) {
}
}
}
is_input_bound: true
summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
)pb"));
}
@ -205,7 +209,8 @@ TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
builder.Add("host1", &host_plane);
builder.Finalize();
// Device input pipeline is not considered for bottleneck analysis.
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
EXPECT_THAT(
combined_tf_data_stats, EqualsProto(R"pb(
bottleneck_analysis: {}
tf_data_stats: {
key: "host1"
@ -280,6 +285,7 @@ TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
}
}
}
summary: "No tf.data activitiy captured in your profile. If your job uses tf.data, try to capture a longer profile."
)pb"));
}
@ -325,13 +331,15 @@ TEST(XPlaneToTfDataStatsTest, MapAndBatch) {
CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
builder.Add("host1", &host_plane);
builder.Finalize();
EXPECT_THAT(combined_tf_data_stats, EqualsProto(R"pb(
EXPECT_THAT(
combined_tf_data_stats, EqualsProto(R"pb(
bottleneck_analysis: {
host: "host1"
input_pipeline: "Host:0"
max_latency_ps: 100000000
iterator_name: "Range"
iterator_long_name: "Iterator::MapAndBatch::Range"
suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
}
tf_data_stats: {
key: "host1"
@ -392,6 +400,8 @@ TEST(XPlaneToTfDataStatsTest, MapAndBatch) {
}
}
}
is_input_bound: true
summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
)pb"));
}

View File

@ -101,10 +101,16 @@ message TfDataBottleneckAnalysis {
string iterator_name = 4;
// Long name of the bottleneck iterator.
string iterator_long_name = 5;
// Suggestion to resolve the bottleneck.
string suggestion = 6;
}
// TfDataStats of all hosts.
message CombinedTfDataStats {
// Whether it is input bound.
bool is_input_bound = 3;
// Summary of the analysis.
string summary = 4;
// Bottleneck analysis result.
TfDataBottleneckAnalysis bottleneck_analysis = 1;
// TfDataStats per host.