Open source pod_viewer.proto and pod_stats.proto.

PiperOrigin-RevId: 335301550
Change-Id: Id97e0c5b3a2317b8871f6b5a72786fac26face28
This commit is contained in:
A. Unique TensorFlower 2020-10-04 10:57:18 -07:00 committed by TensorFlower Gardener
parent 9439a14120
commit e79c0b68e7
3 changed files with 175 additions and 0 deletions

View File

@ -65,6 +65,28 @@ tf_proto_library(
visibility = [":friends"],
)
tf_proto_library(
name = "pod_stats_proto",
srcs = ["pod_stats.proto"],
cc_api_version = 2,
protodeps = [
":diagnostics_proto",
],
visibility = [":friends"],
)
tf_proto_library(
name = "pod_viewer_proto",
srcs = ["pod_viewer.proto"],
cc_api_version = 2,
protodeps = [
":diagnostics_proto",
":pod_stats_proto",
":op_stats_proto",
],
visibility = [":friends"],
)
tf_proto_library(
name = "steps_db_proto",
srcs = ["steps_db.proto"],

View File

@ -0,0 +1,41 @@
syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
message StepBreakdownEvents {
int32 id = 1;
string name = 2;
}
// A database of PodStats records.
message PodStatsDatabase {
// All PodStats records, one for each row in the PodStats tool.
repeated PodStatsRecord pod_stats_record = 1;
// Error and warning messages for diagnosing profiling issues.
Diagnostics diagnostics = 3;
// A map from event type number to event name string for step breakdown.
repeated StepBreakdownEvents step_breakdown_events = 4;
reserved 2;
}
// Next ID: 20
// There is one PodStatsRecord for each step traced on each compute node.
message PodStatsRecord {
// The host name where the trace was collected.
string host_name = 1;
// The TPU global chip id where the trace was collected.
int32 chip_id = 2;
// The TPU node id where the trace was collected.
int32 node_id = 3;
// The step number.
uint32 step_num = 4;
// The step duration in micro-seconds.
double total_duration_us = 5;
// Breakdown the durations for each event type in micro-seconds.
map<int32, double> step_breakdown_us = 19;
// Indicates the bottleneck out of the above mentioned metrics.
string bottleneck = 14;
reserved 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18;
}

View File

@ -0,0 +1,112 @@
// This proto describes the format of the output profile file from
// the Pod Viewer tool.
syntax = "proto3";
package tensorflow.profiler;
import "tensorflow/core/profiler/protobuf/diagnostics.proto";
import "tensorflow/core/profiler/protobuf/op_stats.proto";
import "tensorflow/core/profiler/protobuf/pod_stats.proto";
// Describes the replica groups in a cross replica op (e.g., all-reduce and
// all-to-all).
message ReplicaGroup {
// The ids of the replicas that belongs to the same group. The ordering of the
// ids matters in some ops (e.g., all-to-all).
repeated int64 replica_ids = 1;
}
message AllReduceOpInfo {
// Name of this OP.
string name = 1;
// Number of instances that this OP occurred.
uint32 occurrences = 2;
// The time in microseconds spent in this OP (averaged across all of its
// occurrences).
double duration_us = 3;
// Byte size of data transferred.
uint64 data_size = 4;
// Replica groups.
repeated ReplicaGroup replica_groups = 5;
// Description (e.g. XLA expression).
string description = 6;
}
// Result proto for information in a step across all cores.
message PodStatsMap {
// The (micro) step number.
uint32 step_num = 1;
// A map from core_id to PodStatsRecord.
map<uint32, PodStatsRecord> pod_stats_per_core = 2;
// A database of channel info.
repeated ChannelInfo channel_db = 3;
// A map from core ID to program replica id. Replica id map could change
// during a profile session, but should stay stable within a step.
map<uint32, uint32> core_id_to_replica_id_map = 4;
// A database of all reduce ops.
repeated AllReduceOpInfo all_reduce_op_db = 5;
}
// A sequence of PodStatsMap for each step.
message PodStatsSequence {
repeated PodStatsMap pod_stats_map = 1;
}
// Next ID: 14
// Information about a send and recv channel.
message ChannelInfo {
// Id of the channel.
int64 channel_id = 1;
// Core ids of send ops.
repeated uint32 src_core_ids = 11;
// Core ids of recv ops.
repeated uint32 dst_core_ids = 12;
// Byte size of the data transferred.
uint64 data_size = 4;
// Duration from the beginning of send to the end of recv-done in
// microseconds.
double duration_us = 5;
// Number of occurrences of a channel.
uint32 occurrences = 6;
// Percentage of the link BW utilized over the peak link BW.
double utilization = 7;
// A list of hlo names associated with this channel id.
repeated string hlo_names = 8;
// Duration from the beginning of the recv-done to the beginning of send in
// microseconds. If the recv-done op starts after the beginning of the send
// op, the delay is zero.
double send_delay_us = 9;
// Description (e.g. XLA expression).
string description = 13;
reserved 2, 3, 10;
}
// Run environment of the job.
message PodViewerRunEnvironment {
// The type of TPU used.
string tpu_type = 1;
// Pod system topology.
SystemTopology topology = 2;
}
message PodViewerSummary {
repeated string warnings = 1;
}
// Next ID: 10
// A database of pod viewer records.
message PodViewerDatabase {
// Pod level stats for each step.
PodStatsSequence pod_stats_sequence = 3;
// Job run environment, including number of hosts used, device type, etc.
PodViewerRunEnvironment run_environment = 6;
// Top level summary of pod viewer.
PodViewerSummary summary = 7;
// Error and warning messages for diagnosing profiling issues.
Diagnostics diagnostics = 8;
// A map from event type number to event name string for step breakdown.
repeated StepBreakdownEvents step_breakdown_events = 9;
reserved 1, 2, 4, 5;
}