Add suggestion and summary to CombinedTfDataStats.

PiperOrigin-RevId: 337966727
Change-Id: I7a11c83f87454551d9c33383193c3d463409ae26
Jiho Choi 2020-10-19 17:14:51 -07:00 committed by TensorFlower Gardener
parent 73b709743a
commit 9a36023aa3
5 changed files with 404 additions and 227 deletions
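To make the intent of the change concrete, here is a rough usage sketch of the new fields. The builder, proto messages, and field names come from the diff below; the header path, the ReportTfDataBottleneck wrapper, and the output handling are illustrative assumptions, not part of this commit.

#include <iostream>

// Assumed header path for the builder declared in this change.
#include "tensorflow/core/profiler/convert/xplane_to_tf_data_stats.h"
#include "tensorflow/core/profiler/protobuf/tf_data_stats.pb.h"

// Hypothetical wrapper: build combined tf.data stats for one host and print
// the summary/suggestion fields added by this commit.
void ReportTfDataBottleneck(tensorflow::profiler::XPlane* host_plane) {
  tensorflow::profiler::CombinedTfDataStats combined_tf_data_stats;
  // generate_suggestion defaults to true; pass false to skip the HTML advice.
  tensorflow::profiler::CombinedTfDataStatsBuilder builder(
      &combined_tf_data_stats);
  builder.Add("host1", host_plane);
  builder.Finalize();
  std::cout << combined_tf_data_stats.summary() << "\n";
  if (combined_tf_data_stats.is_input_bound()) {
    std::cout << combined_tf_data_stats.bottleneck_analysis().suggestion()
              << "\n";
  }
}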

View File

@@ -714,6 +714,7 @@ cc_library(
        "//tensorflow/core/profiler/protobuf:tf_data_stats_proto_cc",
        "//tensorflow/core/profiler/protobuf:xplane_proto_cc",
        "//tensorflow/core/profiler/utils:group_events",
        "//tensorflow/core/profiler/utils:html_utils",
        "//tensorflow/core/profiler/utils:tf_op_utils",
        "//tensorflow/core/profiler/utils:tf_xplane_visitor",
        "//tensorflow/core/profiler/utils:timespan",
@@ -722,6 +723,7 @@ cc_library(
        "@com_google_absl//absl/container:flat_hash_map",
        "@com_google_absl//absl/container:flat_hash_set",
        "@com_google_absl//absl/strings",
        "@com_google_absl//absl/strings:str_format",
    ],
)

View File

@@ -17,12 +17,14 @@ limitations under the License.
#include "absl/container/flat_hash_map.h"
#include "absl/container/flat_hash_set.h"
#include "absl/strings/str_format.h"
#include "absl/strings/str_split.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/lib/gtl/map_util.h"
#include "tensorflow/core/platform/protobuf.h"
#include "tensorflow/core/profiler/protobuf/tf_data_stats.pb.h"
#include "tensorflow/core/profiler/utils/group_events.h"
#include "tensorflow/core/profiler/utils/html_utils.h"
#include "tensorflow/core/profiler/utils/tf_op_utils.h"
#include "tensorflow/core/profiler/utils/tf_xplane_visitor.h"
#include "tensorflow/core/profiler/utils/timespan.h"
@@ -274,8 +276,150 @@ void SetBottleneckAnalysis(absl::string_view host_name,
  }
}

std::string GetSuggestion(BottleneckType type) {
  constexpr absl::string_view kPlaybookLink =
      "https://www.tensorflow.org/guide/data_performance_analysis";
  constexpr absl::string_view kPlaybookSourceDatasetLink =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#source_datasets";
  constexpr absl::string_view kPlaybookCpuUtilizationLink =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#3_are_you_reaching_high_cpu_utilization";
  constexpr absl::string_view kPlaybookTransformationLink =
      "https://www.tensorflow.org/guide/"
      "data_performance_analysis#transformation_datasets";
  constexpr absl::string_view kTfGuideParallelDataExtractionLink =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_extraction";
  constexpr absl::string_view kTfGuideParallelTransformationLink =
      "https://www.tensorflow.org/guide/"
      "data_performance#parallelizing_data_transformation";
  constexpr absl::string_view kTfGuideCacheLink =
      "https://www.tensorflow.org/guide/data_performance#caching";
  switch (type) {
    case BottleneckType::kSlowSource:
      return absl::StrFormat(
          "1. Check the locality of a host and input data. Ideally, they "
          "should be in the same cell (or very close, like the same "
          "region).<br/>"
          "2. Parallelize reading from this dataset source. See %s and %s for "
          "more details.<br/>",
          AnchorElement(kPlaybookSourceDatasetLink, "here"),
          AnchorElement(kTfGuideParallelDataExtractionLink, "here"));
    case BottleneckType::kSlowRemoteSource:
      return absl::StrFormat(
          "1. The remote data source is slow. Profile its host to analyze the "
          "issue further.<br/>"
          "2. See %s for other suggestions.",
          AnchorElement(kPlaybookLink, "this"));
    case BottleneckType::kSlowTransformationWithParallelVersion:
      return absl::StrFormat(
          "1. Parallelize this transformation by setting "
          "<code>num_parallel_calls=tf.data.experimental.AUTOTUNE</code>. See "
          "%s for more details.<br/>"
          "2. Consider adding <code>cache</code> after this transformation if "
          "your data fits into memory and it is appropriate (e.g., there is no "
          "randomness in upstream transformations like <code>shuffle</code>). "
          "See %s for more details.<br/>"
          "3. Find more resources %s.",
          AnchorElement(kTfGuideParallelTransformationLink, "this"),
          AnchorElement(kTfGuideCacheLink, "this"),
          AnchorElement(kPlaybookTransformationLink, "here"));
    case BottleneckType::kSlowTransformationWithoutParallelVersion:
      return absl::StrFormat(
          "1. This transformation is inherently sequential. Add outer "
          "parallelism by running multiple copies of the input pipeline over "
          "sharded inputs and combining the results. See %s for more "
          "details.<br/>"
          "2. Consider adding <code>cache</code> after this transformation if "
          "your data fits into memory and it is appropriate (e.g., there is no "
          "randomness in upstream transformations like <code>shuffle</code>). "
          "See %s for more details.<br/>"
          "3. Find more resources %s.",
          AnchorElement(kPlaybookTransformationLink, "this"),
          AnchorElement(kTfGuideCacheLink, "this"),
          AnchorElement(kPlaybookCpuUtilizationLink, "here"));
    default:
      return absl::StrFormat("See %s for suggestions.",
                             AnchorElement(kPlaybookLink, "this"));
  }
}

void SetSuggestion(TfDataBottleneckAnalysis* bottleneck_analysis) {
  if (bottleneck_analysis->max_latency_ps() <= kSlowCallThresholdPs) return;
  bottleneck_analysis->set_suggestion(
      GetSuggestion(GetBottleneckType(bottleneck_analysis->iterator_name())));
}

void SetSummary(CombinedTfDataStats* combined_tf_data_stats) {
  int64 max_latency_ps =
      combined_tf_data_stats->bottleneck_analysis().max_latency_ps();
  if (max_latency_ps > kSlowCallThresholdPs) {
    combined_tf_data_stats->set_is_input_bound(true);
    combined_tf_data_stats->set_summary(
        "Your profile has a tf.data input pipeline slower than 50 us. Below "
        "shows a bottleneck in the slow input pipeline and a suggestion on how "
        "to fix it.");
  } else if (max_latency_ps > 0) {
    combined_tf_data_stats->set_is_input_bound(false);
    combined_tf_data_stats->set_summary(
        "Your profile does not have any tf.data input pipeline slower than 50 "
        "us. Your job could still be input bound if this profile didn't "
        "capture all workers.");
  } else {
    combined_tf_data_stats->set_is_input_bound(false);
    combined_tf_data_stats->set_summary(
        "No tf.data activity captured in your profile. If your job uses "
        "tf.data, try to capture a longer profile.");
  }
}

}  // namespace

BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name) {
  static auto* kBottleneckTypeMap = new absl::flat_hash_map<absl::string_view,
                                                            BottleneckType>(
      {// Read from storage.
       {"TFRecord", BottleneckType::kSlowSource},
       {"SSTable", BottleneckType::kSlowSource},
       {"RecordIO", BottleneckType::kSlowSource},
       {"Spanner", BottleneckType::kSlowSource},
       {"TFColumn", BottleneckType::kSlowSource},
       {"SleepwalkRemoteDataset", BottleneckType::kSlowSource},
       {"TextLine", BottleneckType::kSlowSource},
       {"StitchedTimelineDataset", BottleneckType::kSlowSource},
       {"DateKeyDataset", BottleneckType::kSlowSource},
       {"CapacitorProto", BottleneckType::kSlowSource},
       {"LMDB", BottleneckType::kSlowSource},
       {"ExternalDataset", BottleneckType::kSlowSource},
       {"PearModel", BottleneckType::kSlowSource},
       {"FixedLengthRecordV2", BottleneckType::kSlowSource},
       // Read from local memory.
       {"FromTensor", BottleneckType::kSlowSource},
       {"TensorSlice", BottleneckType::kSlowSource},
       {"Generator", BottleneckType::kSlowSource},
       {"SyntheticDatasetOp", BottleneckType::kSlowSource},
       // Read from remote memory.
       {"GuzzlerDataGuzzlerRemoteDataset", BottleneckType::kSlowRemoteSource},
       {"ReverbDataset", BottleneckType::kSlowRemoteSource},
       {"DatasetService", BottleneckType::kSlowRemoteSource},
       {"DatasetSampleGame", BottleneckType::kSlowRemoteSource},
       {"Courier", BottleneckType::kSlowRemoteSource},
       {"ReverbEpisodeDataset", BottleneckType::kSlowRemoteSource},
       // Transformations with parallel version.
       {"Map", BottleneckType::kSlowTransformationWithParallelVersion},
       {"Interleave", BottleneckType::kSlowTransformationWithParallelVersion},
       // Transformations without parallel version.
       {"Filter", BottleneckType::kSlowTransformationWithoutParallelVersion},
       {"Batch", BottleneckType::kSlowTransformationWithoutParallelVersion},
       {"Unbatch", BottleneckType::kSlowTransformationWithoutParallelVersion}});
  if (auto type =
          gtl::FindOrNull(*kBottleneckTypeMap, bottleneck_iterator_name)) {
    return *type;
  }
  return BottleneckType::kOther;
}

void CombinedTfDataStatsBuilder::Add(absl::string_view host_name,
                                     XPlane* host_plane) {
  TfDataStats& tf_data_stats =
@@ -302,6 +446,8 @@ void CombinedTfDataStatsBuilder::Finalize() {
                          host_name_and_tf_data_stats.second,
                          bottleneck_analysis);
  }
  if (generate_suggestion_) SetSuggestion(bottleneck_analysis);
  SetSummary(combined_tf_data_stats_);
}

}  // namespace profiler

View File

@@ -27,11 +27,23 @@ namespace profiler {

TF_CONST_INIT extern const int64 kSlowCallThresholdPs;

enum class BottleneckType {
  kSlowSource,
  kSlowRemoteSource,
  kSlowTransformationWithParallelVersion,
  kSlowTransformationWithoutParallelVersion,
  kOther,
};

BottleneckType GetBottleneckType(absl::string_view bottleneck_iterator_name);

class CombinedTfDataStatsBuilder {
 public:
  explicit CombinedTfDataStatsBuilder(
      CombinedTfDataStats* combined_tf_data_stats,
      bool generate_suggestion = true)
      : combined_tf_data_stats_(combined_tf_data_stats),
        generate_suggestion_(generate_suggestion) {}

  void Add(absl::string_view host_name, XPlane* host_plane);

@@ -40,6 +52,7 @@ class CombinedTfDataStatsBuilder {

 private:
  CombinedTfDataStats* combined_tf_data_stats_;
  bool generate_suggestion_;
};

}  // namespace profiler
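A minimal sketch of how the classification entry point declared above might behave; the iterator names come from the map in the .cc hunk, while the example function and header path are assumptions.

// Assumed header path; adjust to wherever the declarations above live.
#include "tensorflow/core/profiler/convert/xplane_to_tf_data_stats.h"

void ClassifyExamples() {
  using tensorflow::profiler::BottleneckType;
  using tensorflow::profiler::GetBottleneckType;
  // "Map" has a parallel counterpart, so its suggestion recommends setting
  // num_parallel_calls=tf.data.experimental.AUTOTUNE.
  BottleneckType map_type = GetBottleneckType("Map");
  // "TFRecord" reads from storage, so its suggestion focuses on data locality
  // and parallel reads.
  BottleneckType source_type = GetBottleneckType("TFRecord");
  // Names not in the map fall back to the generic playbook link.
  BottleneckType other_type = GetBottleneckType("MyCustomDataset");
  (void)map_type;
  (void)source_type;
  (void)other_type;
}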

View File

@@ -77,99 +77,103 @@ TEST(XPlaneToTfDataStatsTest, HostInputPipeline) {
  CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
  builder.Add("host1", &host_plane);
  builder.Finalize();
  EXPECT_THAT(
      combined_tf_data_stats, EqualsProto(R"pb(
        bottleneck_analysis: {
          host: "host1"
          input_pipeline: "Host:0"
          max_latency_ps: 100000000
          iterator_name: "Range"
          iterator_long_name: "Iterator::Prefetch::Range"
          suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
        }
        tf_data_stats: {
          key: "host1"
          value: {
            iterator_metadata: {
              key: 123,
              value: {
                id: 123
                name: "Prefetch"
                long_name: "Iterator::Prefetch"
                is_async: true
              }
            }
            iterator_metadata: {
              key: 456,
              value: {
                id: 456
                parent_id: 123
                name: "Range"
                long_name: "Iterator::Prefetch::Range"
                is_async: false
              }
            }
            input_pipelines {
              key: 123,
              value: {
                metadata { id: 123 type: HOST name: "Host:0" }
                avg_latency_ps: 60000000
                min_latency_ps: 20000000
                max_latency_ps: 100000000
                num_slow_calls: 1
                stats {
                  bottleneck_iterator_id: 456
                  iterator_stats {
                    key: 123,
                    value: {
                      id: 123
                      start_time_ps: 0
                      duration_ps: 100000000
                      self_time_ps: 20000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                  iterator_stats {
                    key: 456,
                    value: {
                      id: 456
                      start_time_ps: 0
                      duration_ps: 80000000
                      self_time_ps: 80000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                }
                stats {
                  bottleneck_iterator_id: 123
                  iterator_stats {
                    key: 123,
                    value: {
                      id: 123
                      start_time_ps: 200000000
                      duration_ps: 20000000
                      self_time_ps: 20000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                  iterator_stats {
                    key: 456,
                    value: {
                      id: 456
                      start_time_ps: 100000000
                      duration_ps: 80000000
                      self_time_ps: 80000000
                      is_blocking: false
                      num_calls: 1
                    }
                  }
                }
              }
            }
          }
        }
        is_input_bound: true
        summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
      )pb"));
}

TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
@@ -205,82 +209,84 @@ TEST(XPlaneToTfDataStatsTest, DeviceInputPipeline) {
  builder.Add("host1", &host_plane);
  builder.Finalize();
  // Device input pipeline is not considered for bottleneck analysis.
  EXPECT_THAT(
      combined_tf_data_stats, EqualsProto(R"pb(
        bottleneck_analysis: {}
        tf_data_stats: {
          key: "host1"
          value: {
            iterator_metadata: {
              key: 123,
              value: {
                id: 123
                name: "Prefetch"
                long_name: "Iterator::Prefetch"
                is_async: true
              }
            }
            iterator_metadata: {
              key: 456,
              value: {
                id: 456
                parent_id: 123
                name: "Generator"
                long_name: "Iterator::Prefetch::Generator"
                is_async: false
              }
            }
            input_pipelines {
              key: 123,
              value: {
                metadata { id: 123 type: DEVICE name: "Device:0" }
                avg_latency_ps: 65000000
                min_latency_ps: 30000000
                max_latency_ps: 100000000
                num_slow_calls: 1
                stats {
                  bottleneck_iterator_id: 456
                  iterator_stats {
                    key: 123,
                    value: {
                      id: 123
                      start_time_ps: 100000000
                      duration_ps: 100000000
                      self_time_ps: 20000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                  iterator_stats {
                    key: 456,
                    value: {
                      id: 456
                      start_time_ps: 100000000
                      duration_ps: 80000000
                      self_time_ps: 80000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                }
                stats {
                  bottleneck_iterator_id: 123
                  iterator_stats {
                    key: 123,
                    value: {
                      id: 123
                      start_time_ps: 0
                      duration_ps: 30000000
                      self_time_ps: 30000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                }
              }
            }
          }
        }
        summary: "No tf.data activity captured in your profile. If your job uses tf.data, try to capture a longer profile."
      )pb"));
}

// Test with the following example dataset:
@@ -325,74 +331,78 @@ TEST(XPlaneToTfDataStatsTest, MapAndBatch) {
  CombinedTfDataStatsBuilder builder(&combined_tf_data_stats);
  builder.Add("host1", &host_plane);
  builder.Finalize();
  EXPECT_THAT(
      combined_tf_data_stats, EqualsProto(R"pb(
        bottleneck_analysis: {
          host: "host1"
          input_pipeline: "Host:0"
          max_latency_ps: 100000000
          iterator_name: "Range"
          iterator_long_name: "Iterator::MapAndBatch::Range"
          suggestion: "See <a href=\"https://www.tensorflow.org/guide/data_performance_analysis\" target=\"_blank\">this</a> for suggestions."
        }
        tf_data_stats: {
          key: "host1"
          value: {
            iterator_metadata: {
              key: 123,
              value: {
                id: 123
                name: "MapAndBatch"
                long_name: "Iterator::MapAndBatch"
                is_async: true
              }
            }
            iterator_metadata: {
              key: 456,
              value: {
                id: 456
                parent_id: 123
                name: "Range"
                long_name: "Iterator::MapAndBatch::Range"
                is_async: false
              }
            }
            input_pipelines {
              key: 123,
              value: {
                metadata { id: 123 type: HOST name: "Host:0" }
                avg_latency_ps: 100000000
                min_latency_ps: 100000000
                max_latency_ps: 100000000
                num_slow_calls: 1
                stats {
                  bottleneck_iterator_id: 456
                  iterator_stats {
                    key: 123,
                    value: {
                      id: 123
                      start_time_ps: 0
                      duration_ps: 100000000
                      self_time_ps: 40000000
                      is_blocking: true
                      num_calls: 1
                    }
                  }
                  iterator_stats {
                    key: 456,
                    value: {
                      id: 456
                      start_time_ps: 0
                      duration_ps: 60000000
                      self_time_ps: 60000000
                      is_blocking: true
                      num_calls: 2
                    }
                  }
                }
              }
            }
          }
        }
        is_input_bound: true
        summary: "Your profile has a tf.data input pipeline slower than 50 us. Below shows a bottleneck in the slow input pipeline and a suggestion on how to fix it."
      )pb"));
}

}  // namespace

View File

@@ -101,10 +101,16 @@ message TfDataBottleneckAnalysis {
  string iterator_name = 4;
  // Long name of the bottleneck iterator.
  string iterator_long_name = 5;
  // Suggestion to resolve the bottleneck.
  string suggestion = 6;
}

// TfDataStats of all hosts.
message CombinedTfDataStats {
  // Whether it is input bound.
  bool is_input_bound = 3;
  // Summary of the analysis.
  string summary = 4;
  // Bottleneck analysis result.
  TfDataBottleneckAnalysis bottleneck_analysis = 1;
  // TfDataStats per host.