diff --git a/tensorflow/core/profiler/convert/BUILD b/tensorflow/core/profiler/convert/BUILD index 59142f87b9f..1ebae5ef3df 100644 --- a/tensorflow/core/profiler/convert/BUILD +++ b/tensorflow/core/profiler/convert/BUILD @@ -27,6 +27,21 @@ cc_library( ], ) +tf_cc_test( + name = "xplane_to_op_metrics_db_test", + size = "small", + srcs = ["xplane_to_op_metrics_db_test.cc"], + deps = [ + ":xplane_to_op_metrics_db", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core/profiler/protobuf:op_metrics_proto_cc", + "//tensorflow/core/profiler/utils:time_utils", + "//tensorflow/core/profiler/utils:xplane_builder", + "//tensorflow/core/profiler/utils:xplane_schema", + ], +) + cc_library( name = "run_metadata_to_trace_events", srcs = ["run_metadata_to_trace_events.cc"], @@ -151,10 +166,31 @@ cc_library( hdrs = ["xplane_to_op_stats.h"], deps = [ ":xplane_to_op_metrics_db", + "//tensorflow/core:lib", + "//tensorflow/core/profiler/protobuf:hardware_types_proto_cc", "//tensorflow/core/profiler/protobuf:op_stats_proto_cc", "//tensorflow/core/profiler/protobuf:xplane_proto_cc", + "//tensorflow/core/profiler/utils:hardware_type_utils", "//tensorflow/core/profiler/utils:xplane_schema", "//tensorflow/core/profiler/utils:xplane_utils", + "//tensorflow/core/profiler/utils:xplane_visitor", + ], +) + +tf_cc_test( + name = "xplane_to_op_stats_test", + size = "small", + srcs = ["xplane_to_op_stats_test.cc"], + deps = [ + ":xplane_to_op_stats", + "//tensorflow/core:lib", + "//tensorflow/core:lib_internal", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/profiler/utils:xplane_builder", + "//tensorflow/core/profiler/utils:xplane_schema", ], ) diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc index 9767371b120..1d595561b04 100644 --- 
a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db.cc @@ -219,6 +219,7 @@ OpMetricsDb ConvertDeviceTraceXPlaneToOpMetricsDb( }); }); result.set_total_time_ps(last_op_offset_ps - first_op_offset_ps); + AddIdleOp(&result); return result; } diff --git a/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc new file mode 100644 index 00000000000..3c8d5525370 --- /dev/null +++ b/tensorflow/core/profiler/convert/xplane_to_op_metrics_db_test.cc @@ -0,0 +1,159 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h" + +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/profiler/protobuf/op_metrics.pb.h" +#include "tensorflow/core/profiler/utils/time_utils.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { +namespace { + +void AddTensorFlowOpEvent(absl::string_view tf_op_fullname, + int64 start_timestamp_ns, int64 duration_ns, + bool on_device, XPlaneBuilder* plane, + XLineBuilder* line) { + XEventBuilder event = + line->AddEvent(*plane->GetOrCreateEventMetadata(tf_op_fullname)); + event.SetTimestampNs(start_timestamp_ns); + event.SetDurationNs(duration_ns); + if (!on_device) return; + event.ParseAndAddStatValue(*plane->GetOrCreateStatMetadata("level 0"), + tf_op_fullname); +} + +void SetXPlaneNameAndId(absl::string_view name, int64 id, + XPlaneBuilder* plane) { + plane->SetName(name); + plane->SetId(id); +} + +TEST(ConvertXPlaneToOpMetricsDb, HostOpMetricsDb) { + static constexpr char TfOp1[] = "TfOp1"; + static constexpr char TfOp2[] = "TfOp2"; + constexpr int64 kTfOp1StartNs = 100000; + constexpr int64 kTfOp1DurationNs = 8000; + constexpr int64 kTfOp2StartNs = 110000; + constexpr int64 kTfOp2DurationNs = 10000; + + XPlane xplane; + XPlaneBuilder host_plane(&xplane); + SetXPlaneNameAndId(kHostThreads, /*id=*/0, &host_plane); + XLineBuilder thread1 = host_plane.GetOrCreateLine(/*line_id=*/10); + AddTensorFlowOpEvent(absl::StrCat(TfOp1, ":", TfOp1), kTfOp1StartNs, + kTfOp1DurationNs, /*on_device=*/false, &host_plane, + &thread1); + XLineBuilder thread2 = host_plane.GetOrCreateLine(/*line_id=*/20); + AddTensorFlowOpEvent(absl::StrCat(TfOp1, ":", TfOp1), kTfOp1StartNs, + kTfOp1DurationNs, /*on_device=*/false, &host_plane, + &thread2); + AddTensorFlowOpEvent(absl::StrCat(TfOp2, ":", TfOp2), 
kTfOp2StartNs, + kTfOp2DurationNs, /*on_device=*/false, &host_plane, + &thread2); + + OpMetricsDb op_metrics = ConvertHostThreadsXPlaneToOpMetricsDb(xplane); + // Op1, Op2, Idle. + EXPECT_EQ(3, op_metrics.metrics_db_size()); + uint64 total_op_duration = + NanosToPicos(kTfOp1DurationNs * 2 + kTfOp2DurationNs); + EXPECT_EQ(total_op_duration, op_metrics.total_op_time_ps()); + uint64 total_duration = NanosToPicos(kTfOp2StartNs - kTfOp1StartNs + + kTfOp2DurationNs + kTfOp1DurationNs); + EXPECT_EQ(total_duration, op_metrics.total_time_ps()); + + // Verifies OpMetricsDb is built correctly. + const OpMetrics& op_1 = op_metrics.metrics_db().at(0); + EXPECT_EQ(TfOp1, op_1.name()); + EXPECT_EQ(TfOp1, op_1.category()); + EXPECT_EQ(2, op_1.occurrences()); + EXPECT_EQ(NanosToPicos(kTfOp1DurationNs) * 2, op_1.time_ps()); + + const OpMetrics& idle = op_metrics.metrics_db().at(1); + EXPECT_EQ("IDLE", idle.name()); + // Idle time is the gap between Op2 start and the end of Op1, which is 2000ns. + EXPECT_EQ(NanosToPicos(2000), idle.time_ps()); + + const OpMetrics& op_2 = op_metrics.metrics_db().at(2); + EXPECT_EQ(TfOp2, op_2.name()); + EXPECT_EQ(TfOp2, op_2.category()); + EXPECT_EQ(1, op_2.occurrences()); + EXPECT_EQ(NanosToPicos(kTfOp2DurationNs), op_2.time_ps()); +} + +TEST(ConvertXPlaneToOpMetricsDb, DeviceOpMetricsDb) { + static constexpr char TfOp1[] = "TfOp1"; + static constexpr char TfOp2[] = "TfOp2"; + constexpr int64 kTfOp1StartNs = 100000; + constexpr int64 kTfOp1DurationNs = 8000; + constexpr int64 kTfOp2StartNs = 110000; + constexpr int64 kTfOp2DurationNs = 10000; + + XPlane xplane; + XPlaneBuilder device_plane(&xplane); + SetXPlaneNameAndId(absl::StrCat(kGpuPlanePrefix, ":0"), /*id=*/1, + &device_plane); + XLineBuilder stream1 = device_plane.GetOrCreateLine(/*line_id=*/10); + AddTensorFlowOpEvent(absl::StrCat(TfOp1, ":", TfOp1), kTfOp1StartNs, + kTfOp1DurationNs, /*on_device=*/true, &device_plane, + &stream1); + XLineBuilder stream2 = 
device_plane.GetOrCreateLine(/*line_id=*/20); + AddTensorFlowOpEvent(absl::StrCat(TfOp1, ":", TfOp1), kTfOp1StartNs, + kTfOp1DurationNs, /*on_device=*/true, &device_plane, + &stream2); + AddTensorFlowOpEvent(absl::StrCat(TfOp2, ":", TfOp2), kTfOp2StartNs, + kTfOp2DurationNs, /*on_device=*/true, &device_plane, + &stream2); + + OpMetricsDb op_metrics = ConvertDeviceTraceXPlaneToOpMetricsDb( + xplane, /*peak_tera_flops_per_second=*/0, + /*peak_hbm_bw_giga_bytes_per_second=*/0); + + // Op1, Op2, Idle. + EXPECT_EQ(3, op_metrics.metrics_db_size()); + uint64 total_op_duration = + NanosToPicos(kTfOp1DurationNs * 2 + kTfOp2DurationNs); + EXPECT_EQ(total_op_duration, op_metrics.total_op_time_ps()); + // For device, the total_duration for each device is the total duration merged + // from all GPU streams, which is from 100000 to 120000. + uint64 total_duration = + NanosToPicos(kTfOp2StartNs + kTfOp2DurationNs - kTfOp1StartNs); + EXPECT_EQ(total_duration, op_metrics.total_time_ps()); + + // Verifies OpMetricsDb is built correctly. + const OpMetrics& op_1 = op_metrics.metrics_db().at(0); + EXPECT_EQ(TfOp1, op_1.name()); + EXPECT_EQ(TfOp1, op_1.category()); + EXPECT_EQ(2, op_1.occurrences()); + EXPECT_EQ(NanosToPicos(kTfOp1DurationNs) * 2, op_1.time_ps()); + + const OpMetrics& op_2 = op_metrics.metrics_db().at(1); + EXPECT_EQ(TfOp2, op_2.name()); + EXPECT_EQ(TfOp2, op_2.category()); + EXPECT_EQ(1, op_2.occurrences()); + EXPECT_EQ(NanosToPicos(kTfOp2DurationNs), op_2.time_ps()); + + const OpMetrics& idle = op_metrics.metrics_db().at(2); + EXPECT_EQ("IDLE", idle.name()); + // Summed op time (26000ns) exceeds the 20000ns span, so expect zero idle. 
+ EXPECT_EQ(NanosToPicos(0), idle.time_ps()); +} + +} // namespace +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc index 77346565742..2b5441e3388 100644 --- a/tensorflow/core/profiler/convert/xplane_to_op_stats.cc +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats.cc @@ -15,19 +15,77 @@ limitations under the License. #include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/profiler/convert/xplane_to_op_metrics_db.h" +#include "tensorflow/core/profiler/protobuf/hardware_types.pb.h" +#include "tensorflow/core/profiler/utils/hardware_type_utils.h" #include "tensorflow/core/profiler/utils/xplane_schema.h" #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "tensorflow/core/profiler/utils/xplane_visitor.h" namespace tensorflow { namespace profiler { +namespace { + +DeviceCapabilities GetDeviceCapFromXPlane(const XPlane& device_plane) { + DeviceCapabilities cap; + XPlaneVisitor plane(&device_plane); + if (auto clock_rate_khz = plane.GetStats(kDevCapClockRateKHz)) { + cap.set_clock_rate_in_ghz(clock_rate_khz->int64_value() / 1000000.0); + } + if (auto core_count = plane.GetStats(kDevCapCoreCount)) { + cap.set_num_cores(core_count->int64_value()); + } + // Set memory bandwidth in bytes/s. 
+ if (auto memory_bw = plane.GetStats(kDevCapMemoryBandwidth)) { + cap.set_memory_bandwidth(memory_bw->int64_value()); + } + if (auto memory_size_in_bytes = plane.GetStats(kDevCapMemorySize)) { + cap.set_memory_size_in_bytes(memory_size_in_bytes->uint64_value()); + } + if (auto cap_major = plane.GetStats(kDevCapComputeCapMajor)) { + cap.mutable_compute_capability()->set_major(cap_major->int64_value()); + } + if (auto cap_minor = plane.GetStats(kDevCapComputeCapMinor)) { + cap.mutable_compute_capability()->set_minor(cap_minor->int64_value()); + } + return cap; +} + +PerfEnv GetPerfEnvFromXPlane(const XPlane& device_plane) { + PerfEnv result; + DeviceCapabilities cap = GetDeviceCapFromXPlane(device_plane); + result.set_peak_tera_flops_per_second(GetFlopMaxThroughputPerSM(cap) / 1000 * + cap.num_cores()); + result.set_peak_hbm_bw_giga_bytes_per_second(cap.memory_bandwidth() / 1e9); + result.set_ridge_point(result.peak_tera_flops_per_second() * 1000 / + result.peak_hbm_bw_giga_bytes_per_second()); + return result; +} + +} // namespace OpStats ConvertXSpaceToOpStats(const XSpace& space) { OpStats op_stats; + // Hosts. if (const XPlane* host_trace = FindPlaneWithName(space, kHostThreads)) { *op_stats.mutable_host_op_metrics_db() = ConvertHostThreadsXPlaneToOpMetricsDb(*host_trace); } + // Device. 
+ OpMetricsDbCombiner op_metrics_db_combiner( + op_stats.mutable_device_op_metrics_db()); + for (const XPlane* device_trace : + FindPlanesWithPrefix(space, kGpuPlanePrefix)) { + if (!op_stats.has_perf_env()) { + *op_stats.mutable_perf_env() = GetPerfEnvFromXPlane(*device_trace); + } + const PerfEnv& perf_env = op_stats.perf_env(); + OpMetricsDb device_op_metrics_db = ConvertDeviceTraceXPlaneToOpMetricsDb( + *device_trace, perf_env.peak_tera_flops_per_second(), + perf_env.peak_hbm_bw_giga_bytes_per_second()); + op_metrics_db_combiner.Combine(device_op_metrics_db); + } return op_stats; } diff --git a/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc new file mode 100644 index 00000000000..26b62490e79 --- /dev/null +++ b/tensorflow/core/profiler/convert/xplane_to_op_stats_test.cc @@ -0,0 +1,63 @@ +/* Copyright 2020 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/core/profiler/convert/xplane_to_op_stats.h" + +#include "tensorflow/core/platform/test.h" +#include "tensorflow/core/profiler/utils/xplane_builder.h" +#include "tensorflow/core/profiler/utils/xplane_schema.h" + +namespace tensorflow { +namespace profiler { +namespace { + +TEST(ConvertXPlaneToOpStats, PerfEnv) { + XSpace xspace; + constexpr double kMaxError = 0.01; + constexpr int kClockRateKHz = 1530000; + constexpr int kCoreCount = 80; + constexpr uint64 kMemoryBandwidthBytesPerSecond = 900 * 1e9; + // Volta. + constexpr int kComputeCapMajor = 7; + constexpr int kComputeCapMinor = 0; + + XPlaneBuilder device_plane(xspace.add_planes()); + device_plane.SetName(absl::StrCat(kGpuPlanePrefix, ":0")); + device_plane.ParseAndAddStatValue( + *device_plane.GetOrCreateStatMetadata("clock_rate"), + absl::StrCat(kClockRateKHz)); + device_plane.ParseAndAddStatValue( + *device_plane.GetOrCreateStatMetadata("core_count"), + absl::StrCat(kCoreCount)); + device_plane.ParseAndAddStatValue( + *device_plane.GetOrCreateStatMetadata("memory_bandwidth"), + absl::StrCat(kMemoryBandwidthBytesPerSecond)); + device_plane.ParseAndAddStatValue( + *device_plane.GetOrCreateStatMetadata("compute_cap_major"), + absl::StrCat(kComputeCapMajor)); + device_plane.ParseAndAddStatValue( + *device_plane.GetOrCreateStatMetadata("compute_cap_minor"), + absl::StrCat(kComputeCapMinor)); + + OpStats op_stats = ConvertXSpaceToOpStats(xspace); + const PerfEnv& perf_env = op_stats.perf_env(); + EXPECT_NEAR(141, perf_env.peak_tera_flops_per_second(), kMaxError); + EXPECT_NEAR(900, perf_env.peak_hbm_bw_giga_bytes_per_second(), kMaxError); + EXPECT_NEAR(156.67, perf_env.ridge_point(), kMaxError); +} + +} // namespace +} // namespace profiler +} // namespace tensorflow diff --git a/tensorflow/core/profiler/protobuf/hardware_types.proto b/tensorflow/core/profiler/protobuf/hardware_types.proto index 
0538ee0b056..66d31783788 100644 --- a/tensorflow/core/profiler/protobuf/hardware_types.proto +++ b/tensorflow/core/profiler/protobuf/hardware_types.proto @@ -25,6 +25,6 @@ message DeviceCapabilities { double clock_rate_in_ghz = 1; uint32 num_cores = 2; uint64 memory_size_in_bytes = 3; - uint64 memory_bandwidth = 4; + uint64 memory_bandwidth = 4; // Bytes/s. CudaComputeCapability compute_capability = 5; } diff --git a/tensorflow/core/profiler/utils/op_metrics_db_utils.cc b/tensorflow/core/profiler/utils/op_metrics_db_utils.cc index 47bc798b93c..d59bc1a03dd 100644 --- a/tensorflow/core/profiler/utils/op_metrics_db_utils.cc +++ b/tensorflow/core/profiler/utils/op_metrics_db_utils.cc @@ -76,6 +76,7 @@ double IdleTimeRatio(const OpMetricsDb& metrics_db) { } uint64 IdleTimePs(const OpMetricsDb& metrics_db) { + if (metrics_db.total_time_ps() <= metrics_db.total_op_time_ps()) return 0; return metrics_db.total_time_ps() - metrics_db.total_op_time_ps(); } diff --git a/tensorflow/core/profiler/utils/xplane_utils.cc b/tensorflow/core/profiler/utils/xplane_utils.cc index 8194f041044..a2c2f93d665 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.cc +++ b/tensorflow/core/profiler/utils/xplane_utils.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/core/profiler/utils/xplane_utils.h" +#include "absl/strings/match.h" + namespace tensorflow { namespace profiler { @@ -24,6 +26,15 @@ const XPlane* FindPlaneWithName(const XSpace& space, absl::string_view name) { return nullptr; } +std::vector<const XPlane*> FindPlanesWithPrefix(const XSpace& space, + absl::string_view prefix) { + std::vector<const XPlane*> result; + for (const XPlane& plane : space.planes()) { + if (absl::StartsWith(plane.name(), prefix)) result.push_back(&plane); + } + return result; +} + XPlane* GetOrCreatePlane(XSpace* space, absl::string_view name) { for (XPlane& plane : *space->mutable_planes()) { if (plane.name() == name) return &plane; diff --git a/tensorflow/core/profiler/utils/xplane_utils.h b/tensorflow/core/profiler/utils/xplane_utils.h index 86583bb3634..0a468cc0d58 100644 --- a/tensorflow/core/profiler/utils/xplane_utils.h +++ b/tensorflow/core/profiler/utils/xplane_utils.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_UTILS_H_ #define TENSORFLOW_CORE_PROFILER_UTILS_XPLANE_UTILS_H_ +#include <vector> + #include "absl/strings/string_view.h" #include "tensorflow/core/profiler/protobuf/xplane.pb.h" @@ -24,6 +26,10 @@ namespace profiler { // Returns the plane with the given name or nullptr if not found. const XPlane* FindPlaneWithName(const XSpace& space, absl::string_view name); +// Returns all the planes whose name starts with the given prefix. +std::vector<const XPlane*> FindPlanesWithPrefix(const XSpace& space, + absl::string_view prefix); + // Returns the plane with the given name, create it if necessary. XPlane* GetOrCreatePlane(XSpace* space, absl::string_view name);