From 2882df7cf2191cd55f251a1266d20ec3d6c9d6db Mon Sep 17 00:00:00 2001 From: "A. Unique TensorFlower" Date: Wed, 14 Oct 2020 16:11:49 -0700 Subject: [PATCH] Add Hostname to XPlane and OpStats in OSS. Call OpStatsToPodViewer converter in OSS. PiperOrigin-RevId: 337194132 Change-Id: Idd2de2066012a773eda2cec7d11542d09ecbbfa0 --- tensorflow/core/profiler/convert/BUILD | 3 ++ .../profiler/convert/xplane_to_op_stats.cc | 5 +++ .../convert/xplane_to_op_stats_test.cc | 45 ++++++++++++++----- .../profiler/convert/xplane_to_tools_data.cc | 21 +++++++++ .../core/profiler/protobuf/xplane.proto | 4 +- .../profiler/rpc/profiler_service_impl.cc | 1 + tensorflow/python/profiler/internal/BUILD | 1 + .../profiler/internal/profiler_wrapper.cc | 2 + 8 files changed, 71 insertions(+), 11 deletions(-) diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index e239b99e7bb..2ac68207ee6 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -356,6 +356,7 @@ tf_cc_test( size = "small", srcs = ["xplane_to_op_stats_test.cc"], deps = [ + ":step_events_to_steps_db", ":xplane_to_op_stats", ":xplane_to_tf_functions", "//tensorflow/core:lib", @@ -684,6 +685,7 @@ cc_library( deps = [ ":op_stats_to_input_pipeline_analysis", ":op_stats_to_overview_page", + ":op_stats_to_pod_viewer", ":op_stats_to_tf_stats", ":xplane_to_memory_profile", ":xplane_to_op_stats", @@ -693,6 +695,7 @@ cc_library( "//tensorflow/core/profiler/protobuf:kernel_stats_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:overview_page_proto_cc", + "//tensorflow/core/profiler/protobuf:pod_viewer_proto_cc", "//tensorflow/core/profiler/protobuf:tf_stats_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", "@com_google_absl//absl/strings", diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index 9ca784c01bb..a6d546b1c4e 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -201,6 +201,11 @@ OpStats ConvertXSpaceToOpStats(const XSpace& space, *op_stats.mutable_device_op_metrics_db()->mutable_precision_stats() = ComputePrecisionStats(nonoverlapped_step_events); } + + CoreDetails& details = + (*op_stats.mutable_core_id_to_details())[kDefaultGpuLocalCoreId]; + details.set_hostname(space.hostnames().empty() ? "localhost" + : space.hostnames(0)); return op_stats; } diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc index c3ccb73c078..43508706691 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -21,6 +21,7 @@ limitations under the License. #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/platform/types.h" +#include "tensorflow/core/profiler/convert/step_events_to_steps_db.h" #include "tensorflow/core/profiler/protobuf/diagnostics.pb.h" #include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" @@ -36,6 +37,8 @@ namespace tensorflow { namespace profiler { namespace { +static constexpr char kXPlanePb[] = "xplane.pb"; + TEST(ConvertXPlaneToOpStats, PerfEnv) { XSpace space; constexpr double kMaxError = 0.01; @@ -178,9 +181,20 @@ TEST(ConvertXPlaneToOpStats, PropagateAndDedupErrors) { EXPECT_EQ(kError, op_stats.diagnostics().errors(/*index=*/0)); } +TEST(ConvertXPlaneToOpStats, Hostnames) { + XSpace space; + static constexpr char kHost[] = "host1"; + *space.add_hostnames() = kHost; + + OpStats op_stats = ConvertXSpaceToOpStats(space, OpStatsOptions()); + EXPECT_EQ( + kHost, + op_stats.core_id_to_details().at(kDefaultGpuLocalCoreId).hostname()); +} + // Helper function to build a XSpace and store it to test directory. -void BuildAndStoreXSpaceForTest(Env* test_env, const std::string& test_dir, - const std::string& xspace_name) { +void BuildAndStoreXSpaceForTest(Env* test_env, absl::string_view test_dir, + absl::string_view hostname) { constexpr int64 kStepNum = 123; constexpr int64 kStepId = 456; // Create a host only XSpace for test. @@ -202,6 +216,9 @@ void BuildAndStoreXSpaceForTest(Env* test_env, const std::string& test_dir, CreateXEvent(&host_plane_builder, &executor_thread, "aaa:bbb", 30, 70); GroupTfEvents(&xspace); + xspace.add_hostnames(hostname); + + std::string xspace_name = absl::StrCat(hostname, ".", kXPlanePb); TF_CHECK_OK( WriteBinaryProto(test_env, io::JoinPath(test_dir, xspace_name), xspace)) << "Failed to write binary XSpace to file: " << xspace_name; @@ -214,14 +231,17 @@ TEST(ConvertXPlaneToOpStats, TestConvertMultiXSpacesToCombinedOpStats) { TF_CHECK_OK(test_env->CreateDir(test_dir)) << "Failed to create test directory: " << test_dir; - const std::string xspace1 = "xspace1.pb"; - const std::string xspace2 = "xspace2.pb"; - BuildAndStoreXSpaceForTest(test_env, test_dir, xspace1); - BuildAndStoreXSpaceForTest(test_env, test_dir, xspace2); + static constexpr char kHost1[] = "host1"; + static constexpr char kHost2[] = "host2"; + + BuildAndStoreXSpaceForTest(test_env, test_dir, kHost1); + BuildAndStoreXSpaceForTest(test_env, test_dir, kHost2); std::vector xspace_paths; - xspace_paths.push_back(io::JoinPath(test_dir, xspace1)); - xspace_paths.push_back(io::JoinPath(test_dir, xspace2)); + xspace_paths.push_back( + io::JoinPath(test_dir, absl::StrCat(kHost1, ".", kXPlanePb))); + xspace_paths.push_back( + io::JoinPath(test_dir, absl::StrCat(kHost2, ".", kXPlanePb))); OpStatsOptions options; options.generate_op_metrics_db = true; options.generate_step_db = true; @@ -248,8 +268,13 @@ TEST(ConvertXPlaneToOpStats, TestConvertMultiXSpacesToCombinedOpStats) { const auto& step_info_per_core = combined_op_stats.step_db().step_sequence(0).step_info_per_core(); // global_core_id is computed using: 1000 * host_id + local_core_id. - EXPECT_TRUE(step_info_per_core.contains(1)); - EXPECT_TRUE(step_info_per_core.contains(1001)); + EXPECT_TRUE(step_info_per_core.contains(kDefaultGpuLocalCoreId)); + EXPECT_TRUE(step_info_per_core.contains(1000 + kDefaultGpuLocalCoreId)); + + const auto& core_details_map = combined_op_stats.core_id_to_details(); + EXPECT_EQ(kHost1, core_details_map.at(kDefaultGpuLocalCoreId).hostname()); + EXPECT_EQ(kHost2, + core_details_map.at(1000 + kDefaultGpuLocalCoreId).hostname()); // Tear down environment and directory for testing. int64 undeleted_files, undeleted_dirs; diff --git a/tensorflow/core/profiler/convert/xplane_to_tools_data.cc b/tensorflow/core/profiler/convert/xplane_to_tools_data.cc index 59af75109d0..aaa9aca2f3d 100644 --- a/tensorflow/core/profiler/convert/xplane_to_tools_data.cc +++ b/tensorflow/core/profiler/convert/xplane_to_tools_data.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/profiler/convert/op_stats_to_input_pipeline_analysis.h" #include "tensorflow/core/profiler/convert/op_stats_to_overview_page.h" +#include "tensorflow/core/profiler/convert/op_stats_to_pod_viewer.h" #include "tensorflow/core/profiler/convert/op_stats_to_tf_stats.h" #include "tensorflow/core/profiler/convert/xplane_to_memory_profile.h" #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" @@ -30,6 +31,7 @@ limitations under the License. #include "tensorflow/core/profiler/protobuf/kernel_stats.pb.h" #include "tensorflow/core/profiler/protobuf/op_stats.pb.h" #include "tensorflow/core/profiler/protobuf/overview_page.pb.h" +#include "tensorflow/core/profiler/protobuf/pod_viewer.pb.h" #include "tensorflow/core/profiler/protobuf/tf_stats.pb.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" @@ -153,6 +155,23 @@ std::pair ConvertXSpaceToMemoryProfile( return std::make_pair(json_output, true); } +std::pair ConvertMultiXSpacesToPodViewer( + const std::vector& xspace_paths) { + OpStatsOptions options; + options.generate_op_metrics_db = true; + options.generate_step_db = true; + OpStats combined_op_stats; + Status status = ConvertMultiXSpacesToCombinedOpStats(xspace_paths, options, + &combined_op_stats); + if (!status.ok()) { + LOG(WARNING) << "Could not generate OpStats for pod_viewer. Error: " + << status.error_message(); + return std::make_pair("", false); + } + return std::make_pair( + ConvertOpStatsToPodViewer(combined_op_stats).SerializeAsString(), true); +} + } // namespace std::pair ConvertMultiXSpacesToToolData( @@ -170,6 +189,8 @@ std::pair ConvertMultiXSpacesToToolData( return ConvertMultiXSpacesToKernelStats(xspace_paths); } else if (tool_name == "memory_profile") { return ConvertXSpaceToMemoryProfile(xspace_paths); + } else if (tool_name == "pod_viewer") { + return ConvertMultiXSpacesToPodViewer(xspace_paths); } else { LOG(WARNING) << "Can not find tool: " << tool_name << ". Please update to " << "the latest version of Tensorflow."; diff --git a/tensorflow/core/profiler/protobuf/xplane.proto b/tensorflow/core/profiler/protobuf/xplane.proto index dd34c2f40b1..f57d7609891 100644 --- a/tensorflow/core/profiler/protobuf/xplane.proto +++ b/tensorflow/core/profiler/protobuf/xplane.proto @@ -5,13 +5,15 @@ package tensorflow.profiler; option cc_enable_arenas = true; // A container of parallel XPlanes, generated by one or more profiling sources. -// Next ID: 4 +// Next ID: 5 message XSpace { repeated XPlane planes = 1; // Errors (if any) in the generation of planes. repeated string errors = 2; // Warnings (if any) in the generation of planes; repeated string warnings = 3; + // List of hostnames that XPlanes are generated from. + repeated string hostnames = 4; } // An XPlane is a container of parallel timelines (XLines), generated by a diff --git a/tensorflow/core/profiler/rpc/profiler_service_impl.cc b/tensorflow/core/profiler/rpc/profiler_service_impl.cc index 54eedb65fa0..e8690f1f1f8 100644 --- a/tensorflow/core/profiler/rpc/profiler_service_impl.cc +++ b/tensorflow/core/profiler/rpc/profiler_service_impl.cc @@ -50,6 +50,7 @@ Status CollectDataToRepository(const ProfileRequest& request, // Read the profile data into xspace. XSpace xspace; TF_RETURN_IF_ERROR(profiler->CollectData(&xspace)); + xspace.add_hostnames(request.host_name()); VLOG(3) << "Collected XSpace to repository."; response->set_empty_trace(IsEmpty(xspace)); diff --git a/tensorflow/python/profiler/internal/BUILD b/tensorflow/python/profiler/internal/BUILD index 5adb6d0a4b1..beb0693c80b 100644 --- a/tensorflow/python/profiler/internal/BUILD +++ b/tensorflow/python/profiler/internal/BUILD @@ -120,6 +120,7 @@ tf_python_pybind_extension( ], deps = [ "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", "//tensorflow/core/profiler/convert:xplane_to_tools_data", "//tensorflow/core/profiler/convert:xplane_to_trace_events", "//tensorflow/core/profiler/lib:profiler_session_for_pybind", diff --git a/tensorflow/python/profiler/internal/profiler_wrapper.cc b/tensorflow/python/profiler/internal/profiler_wrapper.cc index f0c289afe01..2f1d29ed334 100644 --- a/tensorflow/python/profiler/internal/profiler_wrapper.cc +++ b/tensorflow/python/profiler/internal/profiler_wrapper.cc @@ -29,6 +29,7 @@ limitations under the License. #include "pybind11/pytypes.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/host_info.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/xplane_to_tools_data.h" @@ -234,6 +235,7 @@ class ProfilerSessionWrapper { tensorflow::profiler::XSpace xspace; tensorflow::Status status; status = session_->CollectData(&xspace); + xspace.add_hostnames(tensorflow::port::Hostname()); session_.reset(); status = tensorflow::profiler::ExportToTensorBoard(xspace, logdir_); tensorflow::MaybeRaiseRegisteredFromStatus(status);