Make the step breakdown the same as on the overview page.
PiperOrigin-RevId: 336917686 Change-Id: If5daa078f8696e9e2a80ed62685605a2bed6495c
This commit is contained in:
parent
c8a9751c55
commit
9031396802
@ -129,6 +129,7 @@ cc_library(
|
||||
copts = tf_profiler_copts(),
|
||||
deps = [
|
||||
"//tensorflow/core:lib",
|
||||
"//tensorflow/core:lib_internal",
|
||||
"//tensorflow/core/profiler/protobuf:op_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:pod_stats_proto_cc",
|
||||
"//tensorflow/core/profiler/protobuf:steps_db_proto_cc",
|
||||
|
@ -17,6 +17,7 @@ limitations under the License.
|
||||
|
||||
#include "google/protobuf/any.pb.h"
|
||||
#include "absl/strings/string_view.h"
|
||||
#include "tensorflow/core/lib/gtl/map_util.h"
|
||||
#include "tensorflow/core/platform/logging.h"
|
||||
#include "tensorflow/core/profiler/protobuf/steps_db.pb.h"
|
||||
#include "tensorflow/core/profiler/utils/diagnostics.h"
|
||||
@ -38,14 +39,31 @@ PodStatsRecord CreatePodStatsRecord(absl::string_view host_name,
|
||||
record.set_step_num(step_info.step_num());
|
||||
record.set_total_duration_us(PicosToMicros(step_info.duration_ps()));
|
||||
auto& step_breakdown_map = *record.mutable_step_breakdown_us();
|
||||
std::vector<std::pair<uint64, std::string>> metrics;
|
||||
for (const auto& entry : generic.type_ps()) {
|
||||
step_breakdown_map[entry.first] = PicosToMicros(entry.second);
|
||||
metrics.emplace_back(
|
||||
entry.second, PrintEventTypeLabel(static_cast<EventType>(entry.first)));
|
||||
}
|
||||
std::vector<std::pair<uint64, absl::string_view>> metrics;
|
||||
|
||||
auto add_event = [&](GenericEventType type,
|
||||
std::initializer_list<EventType> event_list) {
|
||||
uint64 ps = 0;
|
||||
for (const auto& event_type : event_list) {
|
||||
ps += gtl::FindWithDefault(generic.type_ps(), event_type, /*value=*/0);
|
||||
}
|
||||
step_breakdown_map[type] = PicosToMicros(ps);
|
||||
metrics.emplace_back(ps, GetGenericEventTypeStr(type));
|
||||
};
|
||||
|
||||
add_event(kDeviceCompute, {DEVICE_COMPUTE_32, DEVICE_COMPUTE_16});
|
||||
add_event(kDeviceToDevice, {DEVICE_TO_DEVICE, DEVICE_WAIT_DEVICE});
|
||||
add_event(kDeviceCollectives, {DEVICE_COLLECTIVES});
|
||||
add_event(kHostCompute, {HOST_COMPUTE});
|
||||
add_event(kHostPrepare, {HOST_PREPARE});
|
||||
add_event(kInput, {HOST_WAIT_INPUT, HOST_TO_DEVICE, DEVICE_WAIT_HOST});
|
||||
add_event(kOutput, {DEVICE_TO_HOST});
|
||||
add_event(kCompile, {HOST_COMPILE});
|
||||
add_event(kAllOthers, {UNKNOWN_TIME});
|
||||
|
||||
std::sort(metrics.begin(), metrics.end());
|
||||
record.set_bottleneck(metrics.back().second);
|
||||
record.set_bottleneck(metrics.back().second.data(),
|
||||
metrics.back().second.size());
|
||||
return record;
|
||||
}
|
||||
|
||||
@ -53,25 +71,14 @@ PodStatsRecord CreatePodStatsRecord(absl::string_view host_name,
|
||||
|
||||
PodStatsDatabase ConvertOpStatsToPodStats(const OpStats& op_stats) {
|
||||
PodStatsDatabase pod_stats_db;
|
||||
auto add_event = [&pod_stats_db](EventType type) {
|
||||
StepBreakdownEvents* event = pod_stats_db.add_step_breakdown_events();
|
||||
event->set_id(type);
|
||||
event->set_name(PrintEventTypeLabel(type));
|
||||
};
|
||||
add_event(HOST_COMPUTE);
|
||||
add_event(HOST_COMPILE);
|
||||
add_event(HOST_TO_HOST);
|
||||
add_event(HOST_TO_DEVICE);
|
||||
add_event(HOST_PREPARE);
|
||||
add_event(DEVICE_COLLECTIVES);
|
||||
add_event(HOST_WAIT_INPUT);
|
||||
add_event(DEVICE_TO_DEVICE);
|
||||
add_event(DEVICE_TO_HOST);
|
||||
add_event(DEVICE_COMPUTE_32);
|
||||
add_event(DEVICE_COMPUTE_16);
|
||||
add_event(DEVICE_WAIT_DEVICE);
|
||||
add_event(DEVICE_WAIT_HOST);
|
||||
add_event(UNKNOWN_TIME);
|
||||
for (int i = GenericEventType::kFirstGenericEventType;
|
||||
i <= GenericEventType::kLastGenericEventType; i++) {
|
||||
auto& event = *pod_stats_db.add_step_breakdown_events();
|
||||
event.set_id(i);
|
||||
absl::string_view type_str =
|
||||
GetGenericEventTypeStr(static_cast<GenericEventType>(i));
|
||||
event.set_name(type_str.data(), type_str.size());
|
||||
}
|
||||
|
||||
for (const auto& step_sequence : op_stats.step_db().step_sequence()) {
|
||||
int count = 0;
|
||||
|
@ -35,9 +35,10 @@ const double kMaxError = 1e-6;
|
||||
constexpr int kStepNum = 2;
|
||||
constexpr int kCoreId = 1;
|
||||
constexpr int kStepTimePs = 1000;
|
||||
constexpr int kHostComputePs = 100;
|
||||
constexpr int kHostComputePs = 50;
|
||||
constexpr int kHostCompilePs = 50;
|
||||
constexpr int kHostToHostPs = 50;
|
||||
constexpr int kHostToDevicePs = 50;
|
||||
constexpr int kHostPreparePs = 50;
|
||||
constexpr int kDeviceCollectivePs = 350;
|
||||
constexpr int kHostWaitInputPs = 50;
|
||||
@ -60,6 +61,7 @@ void CreateOpStats(OpStats* op_stats) {
|
||||
type_ps[HOST_COMPUTE] = kHostComputePs;
|
||||
type_ps[HOST_COMPILE] = kHostCompilePs;
|
||||
type_ps[HOST_TO_HOST] = kHostToHostPs;
|
||||
type_ps[HOST_TO_DEVICE] = kHostToDevicePs;
|
||||
type_ps[HOST_PREPARE] = kHostPreparePs;
|
||||
type_ps[DEVICE_COLLECTIVES] = kDeviceCollectivePs;
|
||||
type_ps[HOST_WAIT_INPUT] = kHostWaitInputPs;
|
||||
@ -83,34 +85,25 @@ TEST(OpStatsToPodStats, GpuPodStats) {
|
||||
EXPECT_NEAR(PicosToMicros(kStepTimePs), record.total_duration_us(),
|
||||
kMaxError);
|
||||
const auto& breakdown = record.step_breakdown_us();
|
||||
EXPECT_NEAR(PicosToMicros(kHostComputePs), breakdown.at(HOST_COMPUTE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostCompilePs), breakdown.at(HOST_COMPILE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostToHostPs), breakdown.at(HOST_TO_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostPreparePs), breakdown.at(HOST_PREPARE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute32Ps + kDeviceCompute16Ps),
|
||||
breakdown.at(kDeviceCompute), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToDevicePs + kDeviceWaitDevicePs),
|
||||
breakdown.at(kDeviceToDevice), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCollectivePs),
|
||||
breakdown.at(DEVICE_COLLECTIVES), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostWaitInputPs), breakdown.at(HOST_WAIT_INPUT),
|
||||
breakdown.at(kDeviceCollectives), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostComputePs), breakdown.at(kHostCompute),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToDevicePs), breakdown.at(DEVICE_TO_DEVICE),
|
||||
EXPECT_NEAR(PicosToMicros(kHostPreparePs), breakdown.at(kHostPrepare),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToHostPs), breakdown.at(DEVICE_TO_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute32Ps),
|
||||
breakdown.at(DEVICE_COMPUTE_32), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute16Ps),
|
||||
breakdown.at(DEVICE_COMPUTE_16), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceWaitDevicePs),
|
||||
breakdown.at(DEVICE_WAIT_DEVICE), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceWaitHostPs), breakdown.at(DEVICE_WAIT_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kUnknownTimePs), breakdown.at(UNKNOWN_TIME),
|
||||
EXPECT_NEAR(
|
||||
PicosToMicros(kHostWaitInputPs + kHostToDevicePs + kDeviceWaitHostPs),
|
||||
breakdown.at(kInput), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToHostPs), breakdown.at(kOutput), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostCompilePs), breakdown.at(kCompile), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kUnknownTimePs), breakdown.at(kAllOthers),
|
||||
kMaxError);
|
||||
|
||||
EXPECT_EQ(PrintEventTypeLabel(DEVICE_COLLECTIVES), record.bottleneck());
|
||||
EXPECT_EQ(GetGenericEventTypeStr(kDeviceCollectives), record.bottleneck());
|
||||
}
|
||||
|
||||
TEST(OpStatsToPodStats, Diagnostics) {
|
||||
|
@ -36,9 +36,10 @@ const double kMaxError = 1e-6;
|
||||
constexpr int kStepNum = 2;
|
||||
constexpr int kCoreId = 1;
|
||||
constexpr int kStepTimePs = 1000;
|
||||
constexpr int kHostComputePs = 100;
|
||||
constexpr int kHostComputePs = 50;
|
||||
constexpr int kHostCompilePs = 50;
|
||||
constexpr int kHostToHostPs = 50;
|
||||
constexpr int kHostToDevicePs = 50;
|
||||
constexpr int kHostPreparePs = 50;
|
||||
constexpr int kDeviceCollectivePs = 350;
|
||||
constexpr int kHostWaitInputPs = 50;
|
||||
@ -61,6 +62,7 @@ void CreateOpStats(OpStats* op_stats) {
|
||||
type_ps[HOST_COMPUTE] = kHostComputePs;
|
||||
type_ps[HOST_COMPILE] = kHostCompilePs;
|
||||
type_ps[HOST_TO_HOST] = kHostToHostPs;
|
||||
type_ps[HOST_TO_DEVICE] = kHostToDevicePs;
|
||||
type_ps[HOST_PREPARE] = kHostPreparePs;
|
||||
type_ps[DEVICE_COLLECTIVES] = kDeviceCollectivePs;
|
||||
type_ps[HOST_WAIT_INPUT] = kHostWaitInputPs;
|
||||
@ -87,34 +89,25 @@ TEST(OpStatsToPodViewer, GpuPodViewer) {
|
||||
EXPECT_NEAR(PicosToMicros(kStepTimePs), record.total_duration_us(),
|
||||
kMaxError);
|
||||
const auto& breakdown = record.step_breakdown_us();
|
||||
EXPECT_NEAR(PicosToMicros(kHostComputePs), breakdown.at(HOST_COMPUTE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostCompilePs), breakdown.at(HOST_COMPILE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostToHostPs), breakdown.at(HOST_TO_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostPreparePs), breakdown.at(HOST_PREPARE),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute32Ps + kDeviceCompute16Ps),
|
||||
breakdown.at(kDeviceCompute), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToDevicePs + kDeviceWaitDevicePs),
|
||||
breakdown.at(kDeviceToDevice), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCollectivePs),
|
||||
breakdown.at(DEVICE_COLLECTIVES), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostWaitInputPs), breakdown.at(HOST_WAIT_INPUT),
|
||||
breakdown.at(kDeviceCollectives), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostComputePs), breakdown.at(kHostCompute),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToDevicePs), breakdown.at(DEVICE_TO_DEVICE),
|
||||
EXPECT_NEAR(PicosToMicros(kHostPreparePs), breakdown.at(kHostPrepare),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToHostPs), breakdown.at(DEVICE_TO_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute32Ps),
|
||||
breakdown.at(DEVICE_COMPUTE_32), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceCompute16Ps),
|
||||
breakdown.at(DEVICE_COMPUTE_16), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceWaitDevicePs),
|
||||
breakdown.at(DEVICE_WAIT_DEVICE), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceWaitHostPs), breakdown.at(DEVICE_WAIT_HOST),
|
||||
kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kUnknownTimePs), breakdown.at(UNKNOWN_TIME),
|
||||
EXPECT_NEAR(
|
||||
PicosToMicros(kHostWaitInputPs + kHostToDevicePs + kDeviceWaitHostPs),
|
||||
breakdown.at(kInput), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kDeviceToHostPs), breakdown.at(kOutput), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kHostCompilePs), breakdown.at(kCompile), kMaxError);
|
||||
EXPECT_NEAR(PicosToMicros(kUnknownTimePs), breakdown.at(kAllOthers),
|
||||
kMaxError);
|
||||
|
||||
EXPECT_EQ(PrintEventTypeLabel(DEVICE_COLLECTIVES), record.bottleneck());
|
||||
EXPECT_EQ(GetGenericEventTypeStr(kDeviceCollectives), record.bottleneck());
|
||||
}
|
||||
|
||||
TEST(OpStatsToPodViewer, Diagnostics) {
|
||||
|
@ -162,8 +162,35 @@ EventType ClassifyDeviceCompute(absl::string_view event_name,
|
||||
}
|
||||
}
|
||||
|
||||
constexpr int kNumGenericEventTypes = GenericEventType::kLastGenericEventType -
|
||||
GenericEventType::kFirstGenericEventType +
|
||||
1;
|
||||
|
||||
using GenericEventTypeStrMap =
|
||||
absl::flat_hash_map<GenericEventType, absl::string_view>;
|
||||
|
||||
const GenericEventTypeStrMap& GetGenericEventTypeStrMap() {
|
||||
static const auto* generic_event_type_str_map = new GenericEventTypeStrMap({
|
||||
{kDeviceCompute, "Device compute"},
|
||||
{kDeviceToDevice, "Device to device"},
|
||||
{kDeviceCollectives, "Device collective communication"},
|
||||
{kHostCompute, "Host compute"},
|
||||
{kHostPrepare, "Kernel launch"},
|
||||
{kInput, "Input"},
|
||||
{kOutput, "Output"},
|
||||
{kCompile, "Compilation"},
|
||||
{kAllOthers, "All others"},
|
||||
});
|
||||
DCHECK_EQ(generic_event_type_str_map->size(), kNumGenericEventTypes);
|
||||
return *generic_event_type_str_map;
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
absl::string_view GetGenericEventTypeStr(GenericEventType event_type) {
|
||||
return GetGenericEventTypeStrMap().at(event_type);
|
||||
}
|
||||
|
||||
EventType ClassifyGpuEvent(absl::string_view event_name,
|
||||
absl::string_view tensor_shapes) {
|
||||
if (absl::StartsWithIgnoreCase(event_name, "MEMCPYHtoD"))
|
||||
@ -231,42 +258,6 @@ std::string PrintEventType(EventType event_type) {
|
||||
}
|
||||
}
|
||||
|
||||
std::string PrintEventTypeLabel(EventType event_type) {
|
||||
switch (event_type) {
|
||||
case UNKNOWN_TIME:
|
||||
return "Machine idle or unknown events";
|
||||
case HOST_COMPUTE:
|
||||
return "Host compute";
|
||||
case HOST_COMPILE:
|
||||
return "Host compile";
|
||||
case HOST_TO_HOST:
|
||||
return "Host to host";
|
||||
case HOST_TO_DEVICE:
|
||||
return "Host to device";
|
||||
case HOST_PREPARE:
|
||||
return "Host prepare";
|
||||
case DEVICE_COLLECTIVES:
|
||||
return "Device collectives";
|
||||
case HOST_WAIT_INPUT:
|
||||
return "Host wait input";
|
||||
case DEVICE_TO_DEVICE:
|
||||
return "Device to device";
|
||||
case DEVICE_TO_HOST:
|
||||
return "Device to host";
|
||||
case DEVICE_COMPUTE_32:
|
||||
return "Device compute 32-bit";
|
||||
case DEVICE_COMPUTE_16:
|
||||
return "Device compute 16-bit";
|
||||
case DEVICE_WAIT_DEVICE:
|
||||
return "Device wait device";
|
||||
case DEVICE_WAIT_HOST:
|
||||
return "Device wait host";
|
||||
default:
|
||||
DCHECK(false);
|
||||
return "Unknown event type";
|
||||
}
|
||||
}
|
||||
|
||||
std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span) {
|
||||
return absl::StrCat("(", PrintEventType(event_type_span.type), ", ",
|
||||
event_type_span.span.DebugString(), ")");
|
||||
|
@ -68,6 +68,30 @@ enum EventType {
|
||||
LAST_EVENT_TYPE = DEVICE_WAIT_HOST
|
||||
};
|
||||
|
||||
// Generic event types that are shown to the user.
|
||||
enum GenericEventType {
|
||||
kFirstGenericEventType = 1,
|
||||
// Device is computing.
|
||||
kDeviceCompute = kFirstGenericEventType,
|
||||
// Device-to-device communication.
|
||||
kDeviceToDevice,
|
||||
// Collective Ops such as All-Reduce and NCCL.
|
||||
kDeviceCollectives,
|
||||
// Host is computing.
|
||||
kHostCompute,
|
||||
// Host is preparing to launch a computation on device.
|
||||
kHostPrepare,
|
||||
// Device waiting for input from the host.
|
||||
kInput,
|
||||
// Device sending output to the host.
|
||||
kOutput,
|
||||
// Host is compiling.
|
||||
kCompile,
|
||||
// No recognized event associated with the time.
|
||||
kAllOthers,
|
||||
kLastGenericEventType = kAllOthers,
|
||||
};
|
||||
|
||||
// Contains the type and timespan of an event.
|
||||
struct EventTypeSpan {
|
||||
EventType type; // type of this event.
|
||||
@ -197,8 +221,8 @@ EventType ClassifyGpuEvent(absl::string_view event_name,
|
||||
// Returns the name of the given EventType.
|
||||
std::string PrintEventType(EventType event_type);
|
||||
|
||||
// Returns the label of the given EventType.
|
||||
std::string PrintEventTypeLabel(EventType event_type);
|
||||
// Returns the string of the given GenericEventType.
|
||||
absl::string_view GetGenericEventTypeStr(GenericEventType event_type);
|
||||
|
||||
// Returns a string that prints the given EventTypeSpan.
|
||||
std::string PrintEventTypeSpan(const EventTypeSpan& event_type_span);
|
||||
|
Loading…
x
Reference in New Issue
Block a user