Open source pod_viewer.proto and pod_stats.proto.
PiperOrigin-RevId: 335301550 Change-Id: Id97e0c5b3a2317b8871f6b5a72786fac26face28
This commit is contained in:
parent
9439a14120
commit
e79c0b68e7
@ -65,6 +65,28 @@ tf_proto_library(
|
||||
visibility = [":friends"],
|
||||
)
|
||||
|
||||
tf_proto_library(
|
||||
name = "pod_stats_proto",
|
||||
srcs = ["pod_stats.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = [
|
||||
":diagnostics_proto",
|
||||
],
|
||||
visibility = [":friends"],
|
||||
)
|
||||
|
||||
tf_proto_library(
|
||||
name = "pod_viewer_proto",
|
||||
srcs = ["pod_viewer.proto"],
|
||||
cc_api_version = 2,
|
||||
protodeps = [
|
||||
":diagnostics_proto",
|
||||
":pod_stats_proto",
|
||||
":op_stats_proto",
|
||||
],
|
||||
visibility = [":friends"],
|
||||
)
|
||||
|
||||
tf_proto_library(
|
||||
name = "steps_db_proto",
|
||||
srcs = ["steps_db.proto"],
|
||||
|
41
tensorflow/core/profiler/protobuf/pod_stats.proto
Normal file
41
tensorflow/core/profiler/protobuf/pod_stats.proto
Normal file
@ -0,0 +1,41 @@
|
||||
syntax = "proto3";
|
||||
|
||||
package tensorflow.profiler;
|
||||
|
||||
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
|
||||
|
||||
message StepBreakdownEvents {
|
||||
int32 id = 1;
|
||||
string name = 2;
|
||||
}
|
||||
|
||||
// A database of PodStats records.
|
||||
message PodStatsDatabase {
|
||||
// All PodStats records, one for each row in the PodStats tool.
|
||||
repeated PodStatsRecord pod_stats_record = 1;
|
||||
// Error and warning messages for diagnosing profiling issues.
|
||||
Diagnostics diagnostics = 3;
|
||||
// A map from event type number to event name string for step breakdown.
|
||||
repeated StepBreakdownEvents step_breakdown_events = 4;
|
||||
reserved 2;
|
||||
}
|
||||
|
||||
// Next ID: 20
|
||||
// There is one PodStatsRecord for each step traced on each compute node.
|
||||
message PodStatsRecord {
|
||||
// The host name where the trace was collected.
|
||||
string host_name = 1;
|
||||
// The TPU global chip id where the trace was collected.
|
||||
int32 chip_id = 2;
|
||||
// The TPU node id where the trace was collected.
|
||||
int32 node_id = 3;
|
||||
// The step number.
|
||||
uint32 step_num = 4;
|
||||
// The step duration in micro-seconds.
|
||||
double total_duration_us = 5;
|
||||
// Breakdown the durations for each event type in micro-seconds.
|
||||
map<int32, double> step_breakdown_us = 19;
|
||||
// Indicates the bottleneck out of the above mentioned metrics.
|
||||
string bottleneck = 14;
|
||||
reserved 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18;
|
||||
}
|
112
tensorflow/core/profiler/protobuf/pod_viewer.proto
Normal file
112
tensorflow/core/profiler/protobuf/pod_viewer.proto
Normal file
@ -0,0 +1,112 @@
|
||||
// This proto describes the format of the output profile file from
|
||||
// the Pod Viewer tool.
|
||||
syntax = "proto3";
|
||||
|
||||
package tensorflow.profiler;
|
||||
|
||||
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
|
||||
import "tensorflow/core/profiler/protobuf/op_stats.proto";
|
||||
import "tensorflow/core/profiler/protobuf/pod_stats.proto";
|
||||
|
||||
// Describes the replica groups in a cross replica op (e.g., all-reduce and
|
||||
// all-to-all).
|
||||
message ReplicaGroup {
|
||||
// The ids of the replicas that belongs to the same group. The ordering of the
|
||||
// ids matters in some ops (e.g., all-to-all).
|
||||
repeated int64 replica_ids = 1;
|
||||
}
|
||||
|
||||
message AllReduceOpInfo {
|
||||
// Name of this OP.
|
||||
string name = 1;
|
||||
// Number of instances that this OP occurred.
|
||||
uint32 occurrences = 2;
|
||||
// The time in microseconds spent in this OP (averaged across all of its
|
||||
// occurrences).
|
||||
double duration_us = 3;
|
||||
// Byte size of data transferred.
|
||||
uint64 data_size = 4;
|
||||
// Replica groups.
|
||||
repeated ReplicaGroup replica_groups = 5;
|
||||
// Description (e.g. XLA expression).
|
||||
string description = 6;
|
||||
}
|
||||
|
||||
// Result proto for information in a step across all cores.
|
||||
message PodStatsMap {
|
||||
// The (micro) step number.
|
||||
uint32 step_num = 1;
|
||||
// A map from core_id to PodStatsRecord.
|
||||
map<uint32, PodStatsRecord> pod_stats_per_core = 2;
|
||||
// A database of channel info.
|
||||
repeated ChannelInfo channel_db = 3;
|
||||
// A map from core ID to program replica id. Replica id map could change
|
||||
// during a profile session, but should stay stable within a step.
|
||||
map<uint32, uint32> core_id_to_replica_id_map = 4;
|
||||
// A database of all reduce ops.
|
||||
repeated AllReduceOpInfo all_reduce_op_db = 5;
|
||||
}
|
||||
|
||||
// A sequence of PodStatsMap for each step.
|
||||
message PodStatsSequence {
|
||||
repeated PodStatsMap pod_stats_map = 1;
|
||||
}
|
||||
|
||||
// Next ID: 14
|
||||
// Information about a send and recv channel.
|
||||
message ChannelInfo {
|
||||
// Id of the channel.
|
||||
int64 channel_id = 1;
|
||||
// Core ids of send ops.
|
||||
repeated uint32 src_core_ids = 11;
|
||||
// Core ids of recv ops.
|
||||
repeated uint32 dst_core_ids = 12;
|
||||
// Byte size of the data transferred.
|
||||
uint64 data_size = 4;
|
||||
// Duration from the beginning of send to the end of recv-done in
|
||||
// microseconds.
|
||||
double duration_us = 5;
|
||||
// Number of occurrences of a channel.
|
||||
uint32 occurrences = 6;
|
||||
// Percentage of the link BW utilized over the peak link BW.
|
||||
double utilization = 7;
|
||||
// A list of hlo names associated with this channel id.
|
||||
repeated string hlo_names = 8;
|
||||
// Duration from the beginning of the recv-done to the beginning of send in
|
||||
// microseconds. If the recv-done op starts after the beginning of the send
|
||||
// op, the delay is zero.
|
||||
double send_delay_us = 9;
|
||||
// Description (e.g. XLA expression).
|
||||
string description = 13;
|
||||
|
||||
reserved 2, 3, 10;
|
||||
}
|
||||
|
||||
// Run environment of the job.
|
||||
message PodViewerRunEnvironment {
|
||||
// The type of TPU used.
|
||||
string tpu_type = 1;
|
||||
// Pod system topology.
|
||||
SystemTopology topology = 2;
|
||||
}
|
||||
|
||||
message PodViewerSummary {
|
||||
repeated string warnings = 1;
|
||||
}
|
||||
|
||||
// Next ID: 10
|
||||
// A database of pod viewer records.
|
||||
message PodViewerDatabase {
|
||||
// Pod level stats for each step.
|
||||
PodStatsSequence pod_stats_sequence = 3;
|
||||
// Job run environment, including number of hosts used, device type, etc.
|
||||
PodViewerRunEnvironment run_environment = 6;
|
||||
// Top level summary of pod viewer.
|
||||
PodViewerSummary summary = 7;
|
||||
// Error and warning messages for diagnosing profiling issues.
|
||||
Diagnostics diagnostics = 8;
|
||||
// A map from event type number to event name string for step breakdown.
|
||||
repeated StepBreakdownEvents step_breakdown_events = 9;
|
||||
|
||||
reserved 1, 2, 4, 5;
|
||||
}
|
Loading…
x
Reference in New Issue
Block a user