From 93d9e8251f81a2fec1d684a80f43b9f8262b3380 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Wed, 16 Dec 2020 13:14:04 +0100
Subject: [PATCH] feat: perf-pyrecord and perf-pyscript to inject python
 stacktraces

---
 per4m/__main__.py   |  10 ++-
 per4m/example4.py   |  15 ++++
 per4m/giltracer.py  |  73 ++--------------
 per4m/perf2trace.py |  22 +----
 per4m/perfutils.py  |  34 ++++++++
 per4m/record.py     | 162 +++++++++++++++++++++++++++++++++++
 per4m/script.py     | 201 ++++++++++++++++++++++++++++++++++++++++++++
 setup.py            |   9 +-
 8 files changed, 438 insertions(+), 88 deletions(-)
 create mode 100644 per4m/example4.py
 create mode 100644 per4m/perfutils.py
 create mode 100644 per4m/record.py
 create mode 100644 per4m/script.py

diff --git a/per4m/__main__.py b/per4m/__main__.py
index 25f179e..7ea0d98 100644
--- a/per4m/__main__.py
+++ b/per4m/__main__.py
@@ -7,8 +7,10 @@ optional arguments:
   -h, --help          show this help message and exit
 
 positional arguments:
-    perf2trace          convert perf.data to TraceEvent json data
     giltracer           Run VizTracer and perf, and merge the result to see where the GIL is active.
+    record              Run VizTracer and perf simultaneously. See also man perf record.
+    script              Take stacktraces from VizTracer, and inject them in perf script output.
+    perf2trace          Convert perf.data to TraceEvent JSON data.
 
 Examples:
 $ perf script --no-inline | per4m -v
@@ -26,6 +28,12 @@ def main(args=sys.argv):
     elif len(args) > 1 and args[1] == "giltracer":
         from .giltracer import main
         main([os.path.basename(args[0]) + " " + args[1]] + args[2:])
+    elif len(args) > 1 and args[1] == "record":
+        from .record import main
+        main([os.path.basename(args[0]) + " " + args[1]] + args[2:])
+    elif len(args) > 1 and args[1] == "script":
+        from .script import main
+        main([os.path.basename(args[0]) + " " + args[1]] + args[2:])
     else:
         print(usage)
         sys.exit(0)
diff --git a/per4m/example4.py b/per4m/example4.py
new file mode 100644
index 0000000..6bca538
--- /dev/null
+++ b/per4m/example4.py
@@ -0,0 +1,15 @@
+import time
+
+values = [('foo', 3), ('bar', 42), ('bazz', 2)]
+
+def key_function(item):
+    time.sleep(0.01)
+    name, index = item
+    return index
+
+def do_sort():
+    values.sort(key=key_function)
+do_sort()
+def do_print():
+    print(values)
+do_print()
diff --git a/per4m/giltracer.py b/per4m/giltracer.py
index 2589a40..16a5a8d 100644
--- a/per4m/giltracer.py
+++ b/per4m/giltracer.py
@@ -1,89 +1,32 @@
 import argparse
 import os
-import shlex
-import subprocess
 import sys
-import time
-import signal
-
 
 import viztracer
 from viztracer.report_builder import ReportBuilder
 
 import runpy
-
-
-RETRIES = 10
+from .record import PerfRecord
 
 
 usage = """
 
-Convert perf.data to Trace Event Format.
-
+Run VizTracer and perf simultaneously, then inject the GIL information in the viztracer output
 Usage:
 
-$ per4m giltracer -m per4m.example1
+$ giltracer -m per4m.example1
 """
 
-class GilTracer:
-    def __init__(self, perf_output='perf.data', trace_output='giltracer.json', verbose=1):
-        self.perf_output = perf_output
+class GilTracer(PerfRecord):
+    def __init__(self, output='perf.data', trace_output='giltracer.json', verbose=1):
+        super().__init__(output=output, verbose=verbose, args=["-e 'sched:*'"])
         self.trace_output = trace_output
         self.verbose = verbose
 
-    def __enter__(self):
-        pid = os.getpid()
-        cmd = f"perf record -e 'sched:*' --call-graph dwarf -k CLOCK_MONOTONIC --pid {pid} -o {self.perf_output}"
-        if self.verbose >= 2:
-            print(f"Running: {cmd}")
-        args = shlex.split(cmd)
-        self.perf = subprocess.Popen(args, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
-        start_time = time.time()
-        for _ in range(RETRIES):
-            if os.path.exists(self.perf_output):
-                mtime = os.path.getmtime(self.perf_output)
-                if mtime > start_time:
-                    break
-            # we need to wait till perf creates the file
-            time.sleep(0.1)
-        else:
-            self.finish()
-            raise OSError(f'perf did not create {self.perf_output}')
-        start_size = os.path.getsize(self.perf_output)
-        for _ in range(RETRIES):
-            size = os.path.getsize(self.perf_output)
-            if size > start_size:
-                break
-            # we need to wait till perf writes
-            time.sleep(0.1)
-        else:
-            self.finish()
-            raise OSError(f'perf did not write to {self.perf_output}')
-        # and give perf a bit more time
-        time.sleep(0.05)
-        return self
-
-    def finish(self):
-        self.perf.terminate()
-        outs, errs = self.perf.communicate(timeout=5)
-        if self.verbose >= 1:
-            print(outs.decode('utf8'))
-        if errs:
-            if self.verbose >= 1:
-                print(errs.decode('utf8'))
-        if self.perf.returncode not in [0, -signal.SIGTERM.value]:
-            print(signal.SIGTERM)
-            raise OSError(f'perf record fails, got exit code {self.perf.returncode}')
-
     def __exit__(self, *args):
-        self.finish()
-        if self.verbose >= 1:
-            print('Wait for perf to finish...')
-        self.perf.wait()
-
-        pid = os.getpid()
+        super().__exit__(self, *args)
         verbose = '-q ' + '-v ' * self.verbose
-        cmd = f"perf script -i {self.perf_output} --no-inline | per4m perf2trace -o {self.trace_output} {verbose}"
+        cmd = f"perf script -i {self.output} --no-inline | per4m perf2trace -o {self.trace_output} {verbose}"
         if os.system(cmd) != 0:
             raise OSError(f'Failed to run perf or per4m perf2trace, command:\n$ {cmd}')
 
diff --git a/per4m/perf2trace.py b/per4m/perf2trace.py
index b49c533..e3d2abb 100644
--- a/per4m/perf2trace.py
+++ b/per4m/perf2trace.py
@@ -1,25 +1,7 @@
 import sys
 import json
 import argparse
-
-
-def read_events(input):
-    first_line = True
-    stacktrace = []
-    header = None
-    for line in sys.stdin:
-        line = line.strip()
-        if first_line:
-            header = line
-            stacktrace = []
-            first_line = False
-        else:
-            if not line:  # done
-                yield header, stacktrace
-                first_line = True
-                continue
-            stacktrace.append(line)
-
+from .perfutils import read_events
 
 def parse_values(parts, **types):
     values = {}
@@ -48,7 +30,7 @@ def drops_gil(stacktrace):
 
 usage = """
 
-Convert perf.data to Trace Event Format.
+Convert perf.data to TraceEvent JSON data.
 
 Usage:
 
diff --git a/per4m/perfutils.py b/per4m/perfutils.py
new file mode 100644
index 0000000..0fabe78
--- /dev/null
+++ b/per4m/perfutils.py
@@ -0,0 +1,34 @@
+def read_events(input):
+    first_line = True
+    stacktrace = []
+    header = None
+    for line in input:
+        line = line.strip()
+        if first_line:
+            header = line
+            stacktrace = []
+            first_line = False
+        else:
+            if not line:  # done
+                yield header, stacktrace
+                first_line = True
+                continue
+            stacktrace.append(line)
+    if stacktrace:
+        yield header, stacktrace
+
+
+def parse_header(header):
+    parts = header.split()
+    event = parts[4][:-1] # strip off ':'
+    if ":" in event:  # tracepoint
+        dso, triggerpid, cpu, time, _, *other = parts
+        time = float(time[:-1]) * 1e6
+        values = dict(dso=dso, triggerpid=int(triggerpid), cpu=cpu, time=time)
+        tracepoint = True
+    else:  # counter etc
+        dso, triggerpid, time, count, _, *other = parts
+        time = float(time[:-1]) * 1e6
+        values = dict(dso=dso, triggerpid=int(triggerpid), count=count, time=time)
+        tracepoint = False
+    return values, other, tracepoint
diff --git a/per4m/record.py b/per4m/record.py
new file mode 100644
index 0000000..c22da47
--- /dev/null
+++ b/per4m/record.py
@@ -0,0 +1,162 @@
+import argparse
+import contextlib
+import os
+import runpy
+import shlex
+import subprocess
+import signal
+import sys
+import time
+
+import viztracer
+import numpy
+
+
+RETRIES = 10
+
+
+usage = """
+
+Run VizTracer and perf simultaneously. See also man perf record.
+
+Usage:
+
+$ perf-pyrecord -e cycles -m per4m.example2
+"""
+
+class PerfRecord:
+    def __init__(self, output='perf.data', args=[], verbose=1):
+        self.output = output
+        self.args = args
+        self.verbose = verbose
+
+    def __enter__(self):
+        pid = os.getpid()
+        # cmd = f"perf record -e 'sched:*' --call-graph dwarf -k CLOCK_MONOTONIC --pid {pid} -o {self.output}"
+        perf_args = ' '.join(self.args)
+        cmd = f"perf record {perf_args} --call-graph dwarf -k CLOCK_MONOTONIC --pid {pid} -o {self.output}"
+        if self.verbose >= 2:
+            print(f"Running: {cmd}")
+        args = shlex.split(cmd)
+        self.perf = subprocess.Popen(args, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE)
+        start_time = time.time()
+        for _ in range(RETRIES):
+            if os.path.exists(self.output):
+                mtime = os.path.getmtime(self.output)
+                if mtime > start_time:
+                    break
+            # we need to wait till perf creates the file
+            time.sleep(0.1)
+        else:
+            self.finish()
+            raise OSError(f'perf did not create {self.output}')
+        start_size = os.path.getsize(self.output)
+        for _ in range(RETRIES):
+            size = os.path.getsize(self.output)
+            if size > start_size:
+                break
+            # we need to wait till perf writes
+            time.sleep(0.1)
+        else:
+            self.finish()
+            raise OSError(f'perf did not write to {self.output}')
+        # and give perf a bit more time
+        time.sleep(0.05)
+        return self
+
+    def finish(self):
+        self.perf.terminate()
+        outs, errs = self.perf.communicate(timeout=5)
+        if self.verbose >= 1:
+            print(outs.decode('utf8'))
+        if errs:
+            if self.verbose >= 1:
+                print(errs.decode('utf8'))
+        if self.perf.returncode not in [0, -signal.SIGTERM.value]:
+            print(signal.SIGTERM)
+            raise OSError(f'perf record fails, got exit code {self.perf.returncode}')
+
+    def __exit__(self, *args):
+        self.finish()
+        if self.verbose >= 1:
+            print('Wait for perf to finish...')
+        self.perf.wait()
+
+
+@contextlib.contextmanager
+def empty_context():
+    yield
+
+
+def main(argv=sys.argv):
+    parser = argparse.ArgumentParser(argv[0],
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage=usage)
+    parser.add_argument('--module', '-m')
+    parser.add_argument('--import', dest="import_", help="Comma seperated list of modules to import before tracing (cleans up tracing output)")
+    parser.add_argument('--verbose', '-v', action='count', default=1)
+    parser.add_argument('--quiet', '-q', action='count', default=0)
+    parser.add_argument('--output', '-o', dest="output", help="Output filename (default %(default)s)", default='perf.data')
+    parser.add_argument('--output-viztracer', help="Output filename for viztracer (default %(default)s)", default='viztracer.json')
+
+    parser.add_argument('--freq', '-F', help="Profile frequency, passed down to perf record (see man perf record)")
+    parser.add_argument('--event', '-e', dest='events', help="Select PMU event, passed down to perf record (see man perf record)", action='append', default=[])
+
+    parser.add_argument('--tracer_entries', type=int, default=1000000, help="See viztracer --help")
+
+    # # these gets passed to stacktraceinject
+    # parser.add_argument('--keep-cpython-evals', help="keep CPython evaluation stacktraces (instead of replacing) (default: %(default)s)", default=True, action='store_true')
+    # parser.add_argument('--no-keep-cpython-evals', dest="keep_cpython_evals", action='store_false')
+    # parser.add_argument('--allow-mismatch', help="Keep going even when we cannot match the C and Python stacktrace (default: %(default)s)", default=False, action='store_true')
+    # parser.add_argument('--no-allow-mismatch', dest="allow_mismatch", action='store_false')
+    # parser.add_argument('--pedantic', help="If false, accept known stack mismatch issues (default: %(default)s)", default=False, action='store_true')
+    # parser.add_argument('--no-pedantic', dest="pedantic", action='store_false')
+
+    parser.add_argument('args', nargs=argparse.REMAINDER)
+    args = parser.parse_args(argv[1:])
+    verbose = args.verbose - args.quiet
+
+    if args.import_:
+        for module in args.import_.split(','):
+            if verbose >= 2:
+                print(f'importing {module}')
+                __import__(module)
+
+    viztracer_path = args.output_viztracer
+    ctx = contextlib.redirect_stdout(None) if verbose == 0 else empty_context()
+    perf_args = []
+    if args.freq:
+        perf_args.append(f' --freq={args.freq}')
+    for event in args.events:
+        perf_args.append(f' -e {event}')
+    with ctx:
+        with PerfRecord(verbose=verbose, args=perf_args) as perf:
+            with viztracer.VizTracer(output_file=viztracer_path, verbose=verbose, tracer_entries=args.tracer_entries):
+                if args.module:
+                    runpy.run_module(args.module)
+                else:
+                    sys.argv = args.args
+                    runpy.run_path(sys.argv[0])
+    # forward = ""
+    # if args.keep_cpython_evals:
+    #     forward += ' --keep-cpython-evals'
+    # else:
+    #     forward += ' --no-keep-cpython-evals'
+    # if args.allow_mismatch:
+    #     forward += ' --allow-mismatch'
+    # else:
+    #     forward += ' --no-allow-mismatch'
+    # if args.pedantic:
+    #     forward += ' --pedantic'
+    # else:
+    #     forward += ' --no-pedantic'
+    # if args.output:
+    #     forward += ' --output=args.output'
+
+    # cmd = f"perf script -i {perf.perf_output} --no-inline | per4m stacktraceinject {forward} --input {viztracer_path}"
+    # if os.system(cmd) != 0:
+    #     raise OSError(f'Failed to run perf or per4m perf2trace, command:\n$ {cmd}')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/per4m/script.py b/per4m/script.py
new file mode 100644
index 0000000..d723b76
--- /dev/null
+++ b/per4m/script.py
@@ -0,0 +1,201 @@
+import argparse
+import json
+import shlex
+import subprocess
+import sys
+
+from viztracer.prog_snapshot import ProgSnapshot
+
+
+from .perfutils import read_events, parse_header
+
+usage = """
+
+Take stacktraces from VizTracer, and inject them in perf script output
+
+Usage:
+
+$ perf-pyscript | per4m stacktraceinject --input viztracer.json
+"""
+
+def print_stderr(*args):
+    print(*args, file=sys.stderr)
+
+def stacktrace_inject(stacktrace, snapshot, pid, time, keep_cpython_evals=False, perror=print_stderr, allow_mismatch=False, pedantic=False):
+    # we'll do in place modification
+    stacktrace = stacktrace.copy()
+    # First, we build op parts of the callstack from VizTracers
+    # We can go into and out of Python/C several times
+    pystacktraces = [[]]
+    pystacktrace_part = pystacktraces[-1]
+
+    snapshot.goto_tid(pid)
+    snapshot.goto_timestamp(time)
+    bottom_frame = frame = snapshot.curr_frame
+    was_in_python = frame.node.is_python
+    while frame:
+        node = frame.node
+        # output should become sth like: ffffffff9700008c entry_SYSCALL_64_after_hwframe+0x44 ([kernel.kallsyms])
+        filename = node.filename
+        if filename is None:
+            location = ''
+        else:
+            if 'frozen' in filename:
+                filename = '<frozen>'
+            location = f"::{filename}:{node.lineno}"
+        funcname = node.funcname or node.fullname
+        if node.is_python:
+            type = 'py'
+        else:
+            type = 'cext'
+        # lineno = f':{node.lineno}' if node.lineno is not None else ''
+        if node.is_python:
+            if not was_in_python:
+                # this is a jump back to into c, which we allow
+                # pystacktraces.append([])
+                # pystacktrace_part = pystacktraces[-1]
+                pass
+        else:
+            if was_in_python:
+                # this is a jump into Python
+                pystacktraces.append([])
+                pystacktrace_part = pystacktraces[-1]
+        pystacktrace_part.append(f'000000000000000000000000 {type}::{funcname}{location} ([{filename}])')
+        was_in_python = node.is_python
+        frame = frame.parent
+    
+    # Next, we find where in the perf output, we were in the Python evaluate loop (multiple places possible)
+    # A list of (index_min, index_max) where the Python stacktraces should be injected/replaced
+    pyslices = []
+
+    index_min = None
+    index_max = None
+    is_py = [False] * len(stacktrace)
+    # The C Part of Python callstack should begin with a function with this name in it
+    begin_functions = ['PyEval_Eval']
+    # and all function call after that may be one that contains these names
+    continue_functions = ['PyEval_Eval', 'PyFunction_FastCall', 'function_code_fastcall', 'method_call', 'call_function', 'PyCFunction_FastCall', 'PyObject_FastCall']
+    # we could make the code simpler by flipping the stacktrace, so the bottom of the stack starts at index 0
+    # (instead of reversed)
+    for i, call in enumerate(stacktrace[::-1]):
+        i = len(stacktrace) - i - 1
+        # Or we found the beginning, or a continuation of the evaluation loop
+        if (index_max is None and any(sig in call for sig in begin_functions)) or\
+            (index_max is not None and any(sig in call for sig in continue_functions)):
+            if index_max is None:
+                index_max = i
+            index_min = i
+        else:
+            if index_max is not None:
+                pyslices.append((index_min, index_max))
+                is_py[index_min:index_max] = [True] * (index_max - index_min)
+                index_min = index_max = None
+    if index_max is not None:
+        pyslices.append((index_min, index_max))
+    
+    # Lets give some nice output in case it does not match up
+    if len(pystacktraces) != len(pyslices):
+        if not pedantic:
+            # for these we know that we often get a mismatch, and they seem reasonable to ignore
+            known_failures = ['pythread_wrapper', 'switch_fpu_return', 'do_fork', 'ret_from_fork', 'start_thread']
+            for known_failure in known_failures:
+                if any(known_failure in call for call in stacktrace) and not any('_Py' in call for call in stacktrace):
+                    perror(f'Seen {known_failure} in stacktrace during, silently ignoring the stack mismatch, use --no-pedantic to not ignore this')
+                    return stacktrace
+        perror(f"OOOOPS, could not match stacktraces at time {time} for pid {pid}!")
+        perror("Where VizTracer thinks we are:")
+        perror(bottom_frame.node.fullname)
+        try:
+            bottom_frame.show(perror)
+        except:
+            perror('Oops, viztracer fails showing')
+        # snap.where()
+        perror("Callstack slices where we think we should inject the Python callstack:")
+        for pyslice in pyslices:
+            perror(pyslice)
+        perror("Callstack from perf, where * indicates where we think the Python eval loop is:")
+        for i, call in enumerate(stacktrace):
+            perror(i, '*' if is_py[i] else ' ', call)
+        
+        perror("Python call stacks, as found from VizTracer:")
+        for pystacktraces_part in pystacktraces:
+            perror('-----')
+            for i, call in enumerate(pystacktraces_part):
+                perror(call)
+        perror('-----')
+        if not allow_mismatch:
+            raise ValueError('Stack traces from VizTracer and perf could not be matched, run with --allow-mismatch and inspect output carefully')
+    else:
+        for pyslice, pystacktraces_part in zip(pyslices, reversed(pystacktraces)):
+            index_min, index_max = pyslice
+            if keep_cpython_evals:
+                for i in range(index_min, index_max+1):
+                    ptr, call = stacktrace[i].split(' ', 1)
+                    stacktrace[i] = f'{ptr} cpyeval::{call}'
+            else:
+                # mark for delection, to avoid indices getting messed up
+                stacktrace[index_min:index_max+1] = [None] * (index_max - index_min + 1)
+            for pystack in reversed(pystacktraces_part):
+                stacktrace.insert(index_min, pystack)
+        stacktrace = [k for k in stacktrace if k is not None]
+    return stacktrace
+
+
+def main(argv=sys.argv):
+    parser = argparse.ArgumentParser(argv[0],
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        usage=usage)
+    parser.add_argument('--verbose', '-v', action='count', default=0)  # by default quiet, since we write to stdout
+    parser.add_argument('--quiet', '-q', action='count', default=0)
+    parser.add_argument('--keep-cpython-evals', help="keep CPython evaluation stacktraces (instead of replacing) (default: %(default)s)", default=False, action='store_true')
+    parser.add_argument('--no-keep-cpython-evals', dest="keep_cpython_evals", action='store_false')
+    parser.add_argument('--allow-mismatch', help="Keep going even when we cannot match the C and Python stacktrace (default: %(default)s)", default=False, action='store_true')
+    parser.add_argument('--no-allow-mismatch', dest="allow_mismatch", action='store_false')
+    parser.add_argument('--pedantic', help="If false, accept known stack mismatch issues (default: %(default)s)", default=False, action='store_true')
+    parser.add_argument('--no-pedantic', dest="pedantic", action='store_false')
+    parser.add_argument('--input', '-i', help="VizTracer input (default %(default)s)", default="viztracer.json")
+    parser.add_argument('--output', '-o', dest="output", default=None, help="Output filename (default %(default)s)")
+    
+
+    args = parser.parse_args(argv[1:])
+    verbose = args.verbose - args.quiet
+
+    perf_script_args = ['--no-inline']
+    perf_script_args = ' '.join(perf_script_args)
+    cmd = f"perf script {perf_script_args}"
+    if verbose >= 2:
+        print(f"Running: {cmd}")
+    perf_args = shlex.split(cmd)
+    perf = subprocess.Popen(perf_args, shell=False, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) #, stdin=subprocess.PIPE)
+
+    if args.output is None:
+        output = sys.stdout
+    else:
+        output = open(args.output, "w")
+    
+    if verbose >= 1:
+        print_stderr("Loading snapshot")
+    with open(args.input, "r") as f:
+        json_data = f.read()
+    snap = ProgSnapshot(json_data)
+    # find all pids (or tids)
+    pids = list(snap.func_trees)
+    for pid in pids.copy():
+        pids.extend(list(snap.func_trees[pid]))
+    t0 = min(event['ts'] for event in json.loads(json_data)['traceEvents'])
+    
+    for header, stacktrace in read_events(perf.stdout):
+        print(header, file=output)
+        values, _, _ = parse_header(header)
+        time = values['time'] - t0
+        triggerpid = values['triggerpid']
+        if triggerpid in pids:
+            try:
+                stacktrace = stacktrace_inject(stacktrace, snap, triggerpid, time, keep_cpython_evals=args.keep_cpython_evals, allow_mismatch=args.allow_mismatch, pedantic=args.pedantic)
+                for call in stacktrace:
+                    print(call, file=output)
+            except:
+                print(f"Error for event: {header}")
+                raise
+        print(file=output)
+
diff --git a/setup.py b/setup.py
index 9636fdf..9264ad6 100644
--- a/setup.py
+++ b/setup.py
@@ -16,6 +16,11 @@ setup(
     use_scm_version=True,
     setup_requires=['setuptools_scm'],
     entry_points={
-        'console_scripts': ['per4m = per4m.__main__:main']
+        'console_scripts': [
+            'per4m = per4m.__main__:main',
+            'giltracer = per4m.giltracer:main',
+            'perf-pyrecord = per4m.record:main',
+            'perf-pyscript = per4m.script:main',
+        ]
     }
-)
\ No newline at end of file
+)