Merge pull request #37074 from zhuzilin:timeline-dev

PiperOrigin-RevId: 308076685 Change-Id: I036b795320bd67f63bf88c8fbeb48725ab9521df
2020-04-23 10:16:40 -07:00 · 2020-04-23 10:16:40 -07:00 · 958d2c10e6
commit 958d2c10e6
parent 73e688d15a 4abe7b53ba
1 changed files with 104 additions and 7 deletions
--- a/tensorflow/python/client/timeline.py
+++ b/tensorflow/python/client/timeline.py
@ -359,7 +359,8 @@ class Timeline(object):
      graph: (Optional) The 'Graph' that was executed.
    """

-    self._step_stats = step_stats
+    self._origin_step_stats = step_stats
+    self._step_stats = None
    self._graph = graph
    self._chrome_trace = _ChromeTraceFormatter()
    self._next_pid = 0
@ -396,6 +397,17 @@ class Timeline(object):
      inputs = inputs.split(', ')
    return nn, op, inputs

+  def _parse_kernel_label(self, label, node_name):
+    """Parses the fields in a node timeline label."""
+    # Expects labels of the form: retval (arg) detail @@annotation
+    match = re.match(r'.*@@(.*)\#id.*', label)
+    if match is not None:
+      node_name = match.group(1)
+    # Node names should always have the form 'name:op'.
+    fields = node_name.split(':') + ['unknown']
+    name, op = fields[:2]
+    return name, op
+
  def _assign_lanes(self):
    """Assigns non-overlapping lanes for the activities on each device."""
    for device_stats in self._step_stats.dev_stats:
@ -427,9 +439,8 @@ class Timeline(object):
    tid = nodestats.thread_id
    inputs = []
    if is_gputrace:
-      # Node names should always have the form 'name:op'.
-      fields = node_name.split(':') + ['unknown']
-      node_name, op = fields[:2]
+      node_name, op = self._parse_kernel_label(nodestats.timeline_label,
+                                               node_name)
    elif node_name == 'RecvTensor':
      # RPC tracing does not use the standard timeline_label format.
      op = 'RecvTensor'
@ -607,7 +618,81 @@ class Timeline(object):
                                        total_bytes)
    self._allocator_maximums = alloc_maxes

-  def analyze_step_stats(self, show_dataflow=True, show_memory=True):
+  def _preprocess_op_time(self, op_time):
+    """Update the start and end time of ops in step stats.
+
+    Args:
+    op_time: How the execution time of op is shown in timeline. Possible values
+      are "schedule", "gpu" and "all". "schedule" will show op from the time it
+      is scheduled to the end of the scheduling. Notice by the end of its
+      scheduling its async kernels may not start yet. It is shown using the
+      default value from step_stats. "gpu" will show op with the execution time
+      of its kernels on GPU. "all" will show op from the start of its scheduling
+      to the end of its last kernel.
+    """
+    if op_time == 'schedule':
+      self._step_stats = self._origin_step_stats
+      return
+    self._step_stats = copy.deepcopy(self._origin_step_stats)
+    # Separate job task and gpu tracer stream
+    stream_all_stats = []
+    job_stats = []
+    for stats in self._step_stats.dev_stats:
+      if '/stream:all' in stats.device:
+        stream_all_stats.append(stats)
+      elif '/job' in stats.device:
+        job_stats.append(stats)
+
+    # Record the start time of the first kernel and the end time of
+    # the last gpu kernel for all ops.
+    op_gpu_start = {}
+    op_gpu_end = {}
+    for stats in stream_all_stats:
+      for kernel in stats.node_stats:
+        name, _ = self._parse_kernel_label(kernel.timeline_label,
+                                           kernel.node_name)
+        start = kernel.all_start_micros
+        end = kernel.all_start_micros + kernel.all_end_rel_micros
+        if name in op_gpu_start:
+          op_gpu_start[name] = min(op_gpu_start[name], start)
+          op_gpu_end[name] = max(op_gpu_end[name], end)
+        else:
+          op_gpu_start[name] = start
+          op_gpu_end[name] = end
+
+    # Update the start and end time of each op according to the op_time
+    for stats in job_stats:
+      for op in stats.node_stats:
+        if op.node_name in op_gpu_start:
+          end = max(op_gpu_end[op.node_name],
+                    op.all_start_micros + op.all_end_rel_micros)
+          if op_time == 'gpu':
+            op.all_start_micros = op_gpu_start[op.node_name]
+          op.all_end_rel_micros = end - op.all_start_micros
+
+  def analyze_step_stats(self,
+                         show_dataflow=True,
+                         show_memory=True,
+                         op_time='schedule'):
+    """Analyze the step stats and format it into Chrome Trace Format.
+
+    Args:
+      show_dataflow: (Optional.) If True, add flow events to the trace
+        connecting producers and consumers of tensors.
+      show_memory: (Optional.) If True, add object snapshot events to the trace
+        showing the sizes and lifetimes of tensors.
+      op_time: (Optional.) How the execution time of op is shown in timeline.
+        Possible values are "schedule", "gpu" and "all". "schedule" will show op
+        from the time it is scheduled to the end of the scheduling. Notice by
+        the end of its scheduling its async kernels may not start yet. It is
+        shown using the default value from step_stats. "gpu" will show op with
+        the execution time of its kernels on GPU. "all" will show op from the
+        start of its scheduling to the end of its last kernel.
+
+    Returns:
+      A 'StepStatsAnalysis' object.
+    """
+    self._preprocess_op_time(op_time)
    self._allocate_pids()
    self._assign_lanes()
    self._analyze_tensors(show_memory)
@ -618,7 +703,10 @@ class Timeline(object):
        chrome_trace=self._chrome_trace,
        allocator_maximums=self._allocator_maximums)

-  def generate_chrome_trace_format(self, show_dataflow=True, show_memory=False):
+  def generate_chrome_trace_format(self,
+                                   show_dataflow=True,
+                                   show_memory=False,
+                                   op_time='schedule'):
    """Produces a trace in Chrome Trace Format.

    Args:
@ -626,11 +714,20 @@ class Timeline(object):
        connecting producers and consumers of tensors.
      show_memory: (Optional.) If True, add object snapshot events to the trace
        showing the sizes and lifetimes of tensors.
+      op_time: (Optional.) How the execution time of op is shown in timeline.
+        Possible values are "schedule", "gpu" and "all".
+        "schedule" will show op from the time it is scheduled to the end of
+          the scheduling.
+          Notice by the end of its scheduling its async kernels may not start
+          yet. It is shown using the default value from step_stats.
+        "gpu" will show op with the execution time of its kernels on GPU.
+        "all" will show op from the start of its scheduling to the end of
+          its last kernel.

    Returns:
      A JSON formatted string in Chrome Trace format.
    """
    step_stats_analysis = self.analyze_step_stats(
-        show_dataflow=show_dataflow, show_memory=show_memory)
+        show_dataflow=show_dataflow, show_memory=show_memory, op_time=op_time)

    return step_stats_analysis.chrome_trace.format_to_string(pretty=True)