From 67c25d37e5c57877fdf1ae3fe16dc5001e7f8b56 Mon Sep 17 00:00:00 2001
From: "Maarten A. Breddels" <maartenbreddels@gmail.com>
Date: Thu, 17 Dec 2020 17:00:55 +0100
Subject: [PATCH] feat: giltracer shows actualy GIL using uprobes

---
 README.md           | 210 +++++++++++++++++++++++++++++++++++++++++++-
 per4m/example2.py   |  11 +--
 per4m/example3.py   |  24 ++---
 per4m/giltracer.py  |  81 ++++++++++++++---
 per4m/offgil.py     |  14 +--
 per4m/perf2trace.py | 208 +++++++++++++++++++++++++++++++++++++++++--
 per4m/record.py     |  22 +++--
 7 files changed, 517 insertions(+), 53 deletions(-)

diff --git a/README.md b/README.md
index 4e5d0a5..7e116f2 100644
--- a/README.md
+++ b/README.md
@@ -9,12 +9,13 @@ From PyPy
     $ pip install per4m
 
 
-## Linux side
+## Linux side (minimal)
 
 Install perf
 
     $ sudo yum install perf
 
+### Enable perf as user
 Enable users to run perf (use at own risk)
 
     $ sudo sysctl kernel.perf_event_paranoid=-1
@@ -24,18 +25,191 @@ Enable users to see schedule trace events:
     $ sudo mount -o remount,mode=755 /sys/kernel/debug
     $ sudo mount -o remount,mode=755 /sys/kernel/debug/tracing
 
+### Create uprobes (to detect who has the GIL)
+
+We use uprobes to see who calls take_gil and drop_gil, so we need to add those uprobes manually:
+
+```
+sudo perf probe -f -x `which python` python:take_gil=take_gil
+sudo perf probe -f -x `which python` python:take_gil=take_gil%return
+sudo perf probe -f -x `which python` python:drop_gil=drop_gil
+sudo perf probe -f -x `which python` python:drop_gil=drop_gil%return
+```
+
+It should give outlike like:
+```
+Failed to find "take_gil%return",
+ because take_gil is an inlined function and has no return point.
+Added new event:
+  python:take_gil__return (on take_gil%return in /home/maartenbreddels/miniconda/envs/dev/bin/python3.7)
+```
+
+Check if this works
+
+```
+$ perf record -e python:take_gil__return -- python -c "import time; time.sleep(1)"
+[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 0,063 MB perf.data (563 samples) ]
+```
+
+In case you need to remove the uprobes:
+
+    $ sudo perf probe --del 'py*'
+
+
 # Usage
 
-    $ per4m giltracer -m per4m.example2
 
-Open the result.html, and identify the problem (GIL visible, possible low instruction counts/cycle):
+## Detect who has the GIL
 
+```
+$ giltracer -m per4m.example2
+
+[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 0,044 MB perf-gil.data (116 samples) ]
+
+Wait for perf to finish...
+Loading finish                                        
+Saving report to /home/maartenbreddels/github/maartenbreddels/per4m/viztracer.json ...
+Dumping trace data to json, total entries: 72, estimated json file size: 8.4KiB
+Report saved.
+perf script -i perf-gil.data --no-inline --ns | per4m perf2trace gil -i viztracer.json -o giltracer.json -q -v 
+
+Summary of threads:
+
+    PID    total(us)    no gil%✅    has gil%❗    gil wait%❌
+-------  -----------  -----------  ------------  -------------
+2718963     267335     95.8333         0.353181        3.81349
+2719035      56845.9    0.10576       54.7924         45.1019
+2719036      57043.1    0.0279231     54.6021         45.37
+
+High 'no gil' is good (✅), we like low 'has gil' (❗), and we don't want 'gil wait' (❌)
+
+Wrote to giltracer.json
+Saving report to /home/maartenbreddels/github/maartenbreddels/per4m/giltracer.html ...
+Dumping trace data to json, total entries: 362, estimated json file size: 42.4KiB
+Generating HTML report
+Report saved.
+```
+
+This gives an overview of which threads held the GIL, and who needed to wait to get the GIL:
+
+The giltracer.html file gives a visual overview of where a threads want to take the GIL, and where it has the GIL.
+![image](https://user-images.githubusercontent.com/1765949/102506830-d1390300-4083-11eb-9ca2-d311c2ba930b.png)
+
+## See process states
+
+Instead of detecting the GIL, we can also look at process states, and see if and where processes sleep due to the GIL:
+```
+$ giltracer --no-gil-detect --state-detect -m per4m.example2
+```
 
 ![image](https://user-images.githubusercontent.com/1765949/102187104-db0c0c00-3eb3-11eb-93ef-e6d938d9e349.png)
 
-
 The dark red `S(GIL)` blocks indicate the threads/processes are in a waiting state due to the GIL, dark orange `S` is a due to other reasons (like `time.sleep(...)`). The regular pattern is due to Python switching threads after [`sys.getswitchinterval`](https://docs.python.org/3/library/sys.html#sys.getswitchinterval) (0.005 seconds)
 
+## GIL + Process states
+
+Although it is possible to do both:
+```
+$ giltracer --no-gil-detect --state-detect -m per4m.example2
+```
+
+![image](https://user-images.githubusercontent.com/1765949/102507696-d8acdc00-4084-11eb-8fed-0b75c88906c4.png)
+
+This causes clutter, and perf often loses messages.
+
+
+## What you'd like to see
+
+If you use NumPy, it will release the GIL, and you should see less waiting on the GIL, e.g the following should do much better.
+```
+# shows high gil load, but no wait
+import threading
+import numpy as np
+
+
+N = 1024*1024*32
+M = 4
+x = np.arange(N, dtype='f8')
+
+def run():
+    total = 0
+    for i in range(M):
+        total += x.sum()
+    return total
+
+
+thread1 = threading.Thread(target=run)
+thread2 = threading.Thread(target=run)
+
+def main(args=None):
+    thread1.start()
+    thread2.start()
+    total = 0
+    for i in range(1_000_000):
+        total += i
+    for thread in [thread1, thread2]:
+        thread.join()
+```
+
+
+```
+$ giltracer -m per4m.example3
+
+[ perf record: Woken up 1 times to write data ]
+[ perf record: Captured and wrote 0,045 MB perf-gil.data (112 samples) ]
+
+Wait for perf to finish...
+Loading finish                                        
+Saving report to /home/maartenbreddels/github/maartenbreddels/per4m/viztracer.json ...
+Dumping trace data to json, total entries: 95, estimated json file size: 11.1KiB
+Report saved.
+perf script -i perf-gil.data --no-inline --ns | per4m perf2trace gil -i viztracer.json -o giltracer.json -q -v 
+
+Summary of threads:
+
+    PID    total(us)    no gil%✅    has gil%❗    gil wait%❌
+-------  -----------  -----------  ------------  -------------
+2724006     109011        71.0196     28.668          0.312345
+2724077      58313.4      90.3658      0.558379       9.07583
+2724078      55886.6      92.3029      0.459127       7.23798
+
+High 'no gil' is good (✅), we like low 'has gil' (❗), and we don't want 'gil wait' (❌)
+
+Wrote to giltracer.json
+Saving report to /home/maartenbreddels/github/maartenbreddels/per4m/giltracer.html ...
+Dumping trace data to json, total entries: 375, estimated json file size: 43.9KiB
+Generating HTML report
+Report saved.
+```
+
+We see that the two threads waiting on the GIL just 10% of the time (room for improvement, left as an excercise for the reader).
+![image](https://user-images.githubusercontent.com/1765949/102508887-3130a900-4086-11eb-818f-3426e1776320.png)
+
+
+## Who is waiting on the GIL
+
+Analougous to [Brendan Gregg's off cpu analysis](http://www.brendangregg.com/offcpuanalysis.html) we'd like to know in Python who is waiting for the GIL, and we also want to see the the C stacktrace and possibly what the kernel is doing.
+
+Note that we we are looking at stacktraces that are both off cpu and off gil (since off gil but on cpu should be good).
+```
+$ offgil | ~/github/FlameGraph/stackcollapse.pl | ~/github/FlameGraph/flamegraph.pl --countname=us --title="Off-GIL Time Flame Graph" --colors=python > offgil.svg
+```
+
+![image](https://user-images.githubusercontent.com/1765949/102510448-eca60d00-4087-11eb-81e9-1ff2f1013e93.png)
+
+
+Or, if you'd like to see the CPython evaluation stacktraces, and what happens in the kernel:
+```
+$ offgil --no-strip-take-gil --keep-cpython-evals | ~/github/FlameGraph/stackcollapse.pl | ~/github/FlameGraph/flamegraph.pl --countname=us --title="Off-GIL Time Flame Graph" --colors=python > offgil.svg
+```
+
+We see that NumPy has trouble returning to Python land because the main thread has the GIL locked.
+
+![image](https://user-images.githubusercontent.com/1765949/102510646-24ad5000-4088-11eb-97b9-653b3d113231.png)
+
+
 # Usage - Jupyter notebook
 
 First, load the magics
@@ -165,3 +339,31 @@ Show one thread that has a high GIL load, but it does not keep the others from r
     $ per4m giltracer --import="numpy,threading,time,gil_load" -m per4m.example3
 
 ![image](https://user-images.githubusercontent.com/1765949/102223915-96996400-3ee5-11eb-9e2e-46ac6fd5c5e3.png)
+
+
+# Using uprobes
+
+In case you are interested in using uprobes yourself, this is how you use it:
+```
+$ perf record -e 'python:*'  -- time python -m per4m.example3
+1.73user 3.44system 0:00.95elapsed 544%CPU (0avgtext+0avgdata 297224maxresident)k
+0inputs+0outputs (0major+6393minor)pagefaults 0swaps
+[ perf record: Woken up 4 times to write data ]
+[ perf record: Captured and wrote 0,969 MB perf.data (12217 samples) ]
+```
+
+And see what we got:
+```
+$ perf script --no-inline | tail
+          python 2397983 [058] 3846350.744446: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397982 [057] 3846350.749502: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397983 [058] 3846350.754557: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397982 [057] 3846350.759612: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397983 [058] 3846350.764668: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397982 [057] 3846350.769725: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397983 [058] 3846350.772506: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397918 [056] 3846350.777600: python:take_gil__return: (563fc316fcc0 <- 563fc309a0b4)
+          python 2397983 [058] 3846350.777614: python:take_gil__return: (563fc316fcc0 <- 563fc315691f)
+          python 2397918 [056] 3846350.842284: python:take_gil__return: (563fc316fcc0 <- 563fc309a0b4)
+```
+And we see the two threads exchanging the GIL at about every 0.005 seconds.
diff --git a/per4m/example2.py b/per4m/example2.py
index 66b2367..33c3bcd 100644
--- a/per4m/example2.py
+++ b/per4m/example2.py
@@ -13,8 +13,9 @@ def run():
 
 thread1 = threading.Thread(target=run)
 thread2 = threading.Thread(target=run)
-thread1.start()
-thread2.start()
-time.sleep(0.2)
-for thread in [thread1, thread2]:
-    thread.join()
+def main(args=None):
+    thread1.start()
+    thread2.start()
+    time.sleep(0.2)
+    for thread in [thread1, thread2]:
+        thread.join()
diff --git a/per4m/example3.py b/per4m/example3.py
index bbebb60..0975979 100644
--- a/per4m/example3.py
+++ b/per4m/example3.py
@@ -28,15 +28,17 @@ if use_gil_load:
 
 thread1 = threading.Thread(target=run)
 thread2 = threading.Thread(target=run)
-thread1.start()
-thread2.start()
-total = 0
-for i in range(1_000_000):
-    total += i
-for thread in [thread1, thread2]:
-    thread.join()
 
-if use_gil_load:
-    gil_load.stop()
-    stats = gil_load.get()
-    print(gil_load.format(stats))
\ No newline at end of file
+def main(args=None):
+    thread1.start()
+    thread2.start()
+    total = 0
+    for i in range(1_000_000):
+        total += i
+    for thread in [thread1, thread2]:
+        thread.join()
+
+    if use_gil_load:
+        gil_load.stop()
+        stats = gil_load.get()
+        print(gil_load.format(stats))
diff --git a/per4m/giltracer.py b/per4m/giltracer.py
index 6df9c60..829d938 100644
--- a/per4m/giltracer.py
+++ b/per4m/giltracer.py
@@ -17,20 +17,39 @@ Usage:
 $ giltracer -m per4m.example1
 """
 
-class GilTracer(PerfRecord):
-    def __init__(self, output='perf.data', trace_output='giltracer.json', verbose=1):
+class PerfRecordSched(PerfRecord):
+    def __init__(self, output='perf-sched.data', trace_output='schedtracer.json', verbose=1):
         super().__init__(output=output, verbose=verbose, args=["-e 'sched:*'"])
         self.trace_output = trace_output
         self.verbose = verbose
 
-    def __exit__(self, *args):
-        super().__exit__(self, *args)
+    def post_process(self, *args):
         verbose = '-q ' + '-v ' * self.verbose
-        cmd = f"perf script -i {self.output} --no-inline | per4m perf2trace -o {self.trace_output} {verbose}"
+        cmd = f"perf script -i {self.output} --no-inline | per4m perf2trace sched -o {self.trace_output} {verbose}"
+        if self.verbose >= 1:
+            print(cmd)
         if os.system(cmd) != 0:
             raise OSError(f'Failed to run perf or per4m perf2trace, command:\n$ {cmd}')
 
 
+class PerfRecordGIL(PerfRecord):
+    def __init__(self, output='perf-gil.data', trace_output='giltracer.json', viztracer_input="viztracer.json", verbose=1):
+        # TODO: check output of perf probe --list="python:*gil*"  to see if probes are set
+        super().__init__(output=output, verbose=verbose, args=["-e 'python:*gil*'"], stacktrace=False)
+        # this is used to filter the giltracer data
+        self.viztracer_input = viztracer_input
+        self.trace_output = trace_output
+        self.verbose = verbose
+
+    def post_process(self, *args):
+        verbose = '-q ' + '-v ' * self.verbose
+        # -i {self.viztracer_input}   # we don't use this ftm
+        cmd = f"perf script -i {self.output} --no-inline --ns | per4m perf2trace gil -o {self.trace_output} {verbose}"
+        if self.verbose >= 1:
+            print(cmd)
+        if os.system(cmd) != 0:
+            raise OSError(f'Failed to run perf or per4m perf2trace, command:\n$ {cmd}')
+
 def main(argv=sys.argv):
     parser = argparse.ArgumentParser(argv[0],
         formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -39,7 +58,12 @@ def main(argv=sys.argv):
     parser.add_argument('--import', dest="import_", help="Comma seperated list of modules to import before tracing (cleans up tracing output)")
     parser.add_argument('--verbose', '-v', action='count', default=1)
     parser.add_argument('--quiet', '-q', action='count', default=0)
-    parser.add_argument('--output', '-o', dest="output", default='result.html', help="Output filename (default %(default)s)")
+    parser.add_argument('--output', '-o', dest="output", default='giltracer.html', help="Output filename (default %(default)s)")
+    parser.add_argument('--state-detect', help="Use perf sched events to detect if a process is sleeping due to the GIL (default: %(default)s)", default=False, action='store_true')
+    parser.add_argument('--no-state-detect', dest="state_detect", action='store_false')
+    parser.add_argument('--gil-detect', help="Use uprobes to detect who has the GIL (read README.md) (default: %(default)s)", default=True, action='store_true')
+    parser.add_argument('--no-gil-detect', dest="gil_detect", action='store_false')
+
     parser.add_argument('args', nargs=argparse.REMAINDER)
     args = parser.parse_args(argv[1:])
     verbose = args.verbose - args.quiet
@@ -50,14 +74,43 @@ def main(argv=sys.argv):
                 print(f'importing {module}')
             __import__(module)
 
-    with GilTracer(verbose=verbose) as gt:
-        with viztracer.VizTracer(output_file="viztracer.json", verbose=verbose):
-            if args.module:
-                runpy.run_module(args.module)
-            else:
-                sys.argv = args.args
-                runpy.run_path(sys.argv[0])
-    builder = ReportBuilder(['./viztracer.json', './giltracer.json'], verbose=verbose)
+    perf1 = PerfRecordSched(verbose=verbose) if args.state_detect else None
+    perf2 = PerfRecordGIL(verbose=verbose) if args.gil_detect else None
+    vt = viztracer.VizTracer(output_file="viztracer.json", verbose=verbose)
+
+    # pass on the rest of the arguments
+    sys.argv = args.args
+    if args.module:
+        module = runpy.run_module(args.module)
+    else:
+        module = runpy.run_path(sys.argv[0])
+
+    if perf1:
+        perf1.start()
+    if perf2:
+        perf2.start()
+
+    try:
+        vt.start()
+        module['main'](args.args)
+    finally:
+        vt.stop()
+        if perf1:
+            perf1.stop()
+        if perf2:
+            perf2.stop()
+        vt.save('viztracer.json')
+        if perf1:
+            perf1.post_process()
+        if perf2:
+            perf2.post_process()
+
+    files = ['viztracer.json']
+    if perf1:
+        files.append('schedtracer.json')
+    if perf2:
+        files.append('giltracer.json')
+    builder = ReportBuilder(files, verbose=verbose)
     builder.save(output_file=args.output)
 
 
diff --git a/per4m/offgil.py b/per4m/offgil.py
index 5d20d90..a8b22ae 100644
--- a/per4m/offgil.py
+++ b/per4m/offgil.py
@@ -19,7 +19,10 @@ Take stacktraces from VizTracer, and inject them in perf script output and print
 
 Usage:
 
-$ giltracer -m per4m.example3
+# for this we need the process states, and we can skip the GIL detection for performance
+$ giltracer --no-gil-detect --state-detect -m per4m.example3
+
+# offgil will use perf-sched.data
 $ offgil | ~/github/FlameGraph/stackcollapse.pl | ~/github/FlameGraph/flamegraph.pl --countname=us --title="Off-GIL Time Flame Graph" --colors=python > offgil.svg
 """
 
@@ -39,7 +42,8 @@ def main(argv=sys.argv):
     parser.add_argument('--no-allow-mismatch', dest="allow_mismatch", action='store_false')
     parser.add_argument('--pedantic', help="If false, accept known stack mismatch issues (default: %(default)s)", default=False, action='store_true')
     parser.add_argument('--no-pedantic', dest="pedantic", action='store_false')
-    parser.add_argument('--input', '-i', help="VizTracer input (default %(default)s)", default="viztracer.json")
+    parser.add_argument('--input-perf', help="Perf input (default %(default)s)", default="perf-sched.data")
+    parser.add_argument('--input-viztracer', help="VizTracer input (default %(default)s)", default="viztracer.json")
     parser.add_argument('--output', '-o', dest="output", default=None, help="Output filename (default %(default)s)")
     
 
@@ -48,7 +52,7 @@ def main(argv=sys.argv):
 
     perf_script_args = ['--no-inline']
     perf_script_args = ' '.join(perf_script_args)
-    cmd = f"perf script {perf_script_args}"
+    cmd = f"perf script {perf_script_args} -i {args.input_perf}"
     if verbose >= 2:
         print(f"Running: {cmd}")
     perf_args = shlex.split(cmd)
@@ -61,14 +65,14 @@ def main(argv=sys.argv):
     
     if verbose >= 1:
         print_stderr("Loading snapshot")
-    with open(args.input, "r") as f:
+    with open(args.input_viztracer, "r") as f:
         json_data = f.read()
     snap = ProgSnapshot(json_data)
     # find all pids (or tids)
     pids = list(snap.func_trees)
     for pid in pids.copy():
         pids.extend(list(snap.func_trees[pid]))
-    t0 = min(event['ts'] for event in json.loads(json_data)['traceEvents'])
+    t0 = min(event['ts'] for event in json.loads(json_data)['traceEvents'] if 'ts' in event)
     
     for header, stacktrace, event in perf2trace(perf.stdout, verbose):
         if event['name'] == args.state:
diff --git a/per4m/perf2trace.py b/per4m/perf2trace.py
index 6d8dfee..bb3e861 100644
--- a/per4m/perf2trace.py
+++ b/per4m/perf2trace.py
@@ -1,6 +1,12 @@
-import sys
-import json
 import argparse
+import argparse
+from collections import defaultdict
+import json
+import re
+import sys
+
+import tabulate
+
 from .perfutils import read_events
 
 def parse_values(parts, **types):
@@ -32,13 +38,22 @@ usage = """
 
 Convert perf.data to TraceEvent JSON data.
 
-Usage:
+Usage for scheduling data:
 
 Always run perf with -e 'sched:*' --call-graph dwarf -k CLOCK_MONOTONIC, the rest of the events are extra
 $ perf record -e 'sched:*' --call-graph dwarf -k CLOCK_MONOTONIC -e L1-dcache-load-misses -e instructions -e cycles -e page-faults -- python -m per4m.example1 
 Run with --no-online, otherwise it may run slow
-$ perf script --no-inline | per4m perf2trace --no-running -o example1perf.json
-$ viztracer --combine example1.json example1perf.json -o example1.html
+$ perf script --no-inline | per4m perf2trace sched --no-running -o example1sched.json
+$ viztracer --combine example1sched.json example1perf.json -o example1.html
+
+Usage for GIL:
+
+(read the docs to install the GIL uprobes)
+$ perf record -e 'python:*gil*' -k CLOCK_MONOTONIC -- python -m per4m.example1
+$ perf script --ns --no-inline | per4m perf2trace gil -o example1gil.json
+$ viztracer --combine example1.json example1gil.json -o example1.html
+
+
 """
 
 def main(argv=sys.argv):
@@ -51,9 +66,16 @@ def main(argv=sys.argv):
     parser.add_argument('--no-sleeping', dest="sleeping", action='store_false')
     parser.add_argument('--running', help="show running phase (default: %(default)s)", default=False, action='store_true')
     parser.add_argument('--no-running', dest="running", action='store_false')
+    parser.add_argument('--as-async', help="show as async (above the events) (default: %(default)s)", default=True, action='store_true')
+    parser.add_argument('--no-as-async', dest="as_async", action='store_false')
+    parser.add_argument('--only-lock', help="show only when we have the GIL (default: %(default)s)", default=False, action='store_true')
+    parser.add_argument('--no-only-lock', dest="as_async", action='store_false')
     parser.add_argument('--all-tracepoints', help="store all tracepoints phase (default: %(default)s)", default=False, action='store_true')
 
+    # parser.add_argument('--input', '-i', help="Optional VizTracer input for filtering and gil load calculations")
+
     parser.add_argument('--output', '-o', dest="output", default='perf.json', help="Output filename (default %(default)s)")
+    parser.add_argument("type", help="Type of conversion to do", choices=['sched', 'gil'])
 
 
     args = parser.parse_args(argv[1:])
@@ -62,14 +84,184 @@ def main(argv=sys.argv):
     store_sleeping = args.sleeping
 
     trace_events = []
-    for header, tb, event in perf2trace(sys.stdin, verbose=verbose, store_runing=store_runing, store_sleeping=store_sleeping, all_tracepoints=args.all_tracepoints):
-        trace_events.append(event)
+    if args.type == "sched":
+        for header, tb, event in perf2trace(sys.stdin, verbose=verbose, store_runing=store_runing, store_sleeping=store_sleeping, all_tracepoints=args.all_tracepoints):
+            trace_events.append(event)
+    elif args.type == "gil":
+        # we don't use this any more, lets keep it to maybe pull out thread information
+        # if args.input:
+        #     with open(args.input) as f:
+        #         input_events = json.load(f)['traceEvents']
+        #     for event in input_events:
+        #         if 'ts' in event:
+        #             ts = event['ts']
+        #             pid = int(event['tid'])
+        #             t_min[pid] = min(t_min.get(pid, ts), ts)
+        #             t_max[pid] = max(t_max.get(pid, ts), ts)
+
+        for header, event in gil2trace(sys.stdin, verbose=verbose, as_async=args.as_async, only_lock=args.only_lock):
+            if verbose >= 3:
+                print(event)
+            trace_events.append(event)
+    else:
+        raise ValueError(f'Unknown type {args.type}')
     with open(args.output, 'w') as f:
         json.dump({'traceEvents': trace_events}, f)
     if verbose >= 1:
         print(f"Wrote to {args.output}")
 
 
+def gil2trace(input, verbose=1, take_probe="python:take_gil(_\d)?$", take_probe_return="python:take_gil__return", drop_probe="python:drop_gil(_\d)?$", drop_probe_return="python:drop_gil__return", as_async=False, show_instant=True, duration_min_us=1, only_lock=True, t_min={}, t_max={}):
+    time_first = None
+
+    # dicts that map pid -> time
+    wants_take_gil = {}
+    wants_drop_gil = {}
+    has_gil = {}
+
+    parent_pid = None
+    # to avoid printing out the same msg over and over
+    ignored = set()
+    # keep track of various times
+    time_on_gil = defaultdict(int)
+    time_wait_gil = defaultdict(int)
+    jitter = 1e-3  # add 1 ns for proper sorting
+    for header in input:
+        try:
+            header = header.rstrip()
+            if verbose >= 2:
+                print(header)
+
+            # just ignore stack traces and seperators
+            if not header or header.split()[0].isdigit():
+                continue
+            # parse the header
+            comm, pid, cpu, time, event, *other = header.split()
+            pid = int(pid)
+            if parent_pid is None:  # lets assume the first event is from the parent process
+                parent_pid = pid
+            assert event[-1] == ':'
+            event = event[:-1]  # take off :
+            assert time[-1] == ':'
+            time = time[:-1]  # take off :
+            time = float(time[:-1]) * 1e6
+
+            # keeping track for statistics
+            t_min[pid] = min(time, t_min.get(pid, time))
+            t_max[pid] = max(time, t_max.get(pid, time))
+
+            # and proces it
+            if time_first is None:
+                time_first = time
+
+            if re.match(take_probe, event):
+                wants_take_gil[pid] = time
+                scope = "t"  # thread scope
+                yield header, {"pid": parent_pid, "tid": pid, "ts": time, "name": 'take', "ph": "i", "cat": "GIL state", 's': scope, 'cname': 'bad'}
+            elif re.match(take_probe_return, event):
+                if has_gil:
+                    for other_pid in has_gil:
+                        gap = time - has_gil[other_pid]
+                        # optimistic overlap would be if we take it from the time it wanted to drop
+                        # gap = time - wants_drop_gil[other_pid]
+                        # print(gap)
+                        if gap < 0: # this many us overlap is ok
+                            # I think it happens when a thread has dropped the GIL, but it has not returned yet
+                            # TODO: we should be able to correct the times
+                            tip = "(If running as giltracer, try passing --no-state-detect to reduce CPU load"
+                            print(f'Anomaly: PID {other_pid} already seems to have the GIL, {overlap} us overlap with {pid} {pid==parent_pid}) {tip}', file=sys.stderr)
+                            # keep this for debugging
+                            # yield header, {"pid": parent_pid, "tid": other_pid, "ts": has_gil[other_pid], "dur": overlap, "name": 'GIL overlap1', "ph": "X", "cat": "process state"}
+                            # yield header, {"pid": parent_pid, "tid": pid, "ts": has_gil[other_pid], "dur": overlap, "name": 'GIL overlap1', "ph": "X", "cat": "process state"}
+                            # yield header, {"pid": parent_pid, "tid": other_pid, "ts": wants_drop_gil[other_pid], "dur": overlap_relaxed, "name": 'GIL overlap2', "ph": "X", "cat": "process state"}
+                            # yield header, {"pid": parent_pid, "tid": pid, "ts": wants_drop_gil[other_pid], "dur": overlap_relaxed, "name": 'GIL overlap2', "ph": "X", "cat": "process state"}
+                has_gil[pid] = time
+                time_wait_gil[pid] += time - max(t_min[pid], wants_take_gil[pid])
+            elif re.match(drop_probe, event):
+                wants_drop_gil[pid] = time
+                scope = "t"  # thread scope
+                yield header, {"pid": parent_pid, "tid": pid, "ts": time, "name": 'drop', "ph": "i", "cat": "GIL state", 's': scope, 'cname': 'bad'}
+            elif re.match(drop_probe_return, event):
+                if pid not in has_gil:
+                    print(f'Anomaly: this PIDs drops the GIL: {pid}, but never took it (maybe we missed it?)', file=sys.stderr)
+                time_gil_take = has_gil.get(pid, time_first)
+                time_gil_drop = time
+                duration = time_gil_drop - time_gil_take
+                time_on_gil[pid] += duration
+                if pid in has_gil:
+                    del has_gil[pid]
+                if duration < duration_min_us:
+                    if verbose >= 2:
+                        print(f'Ignoring {duration}us duration GIL lock', file=sys.stderr)
+                    continue
+
+                args = {'duraction': f'{duration} us'}
+                if show_instant:
+                    # we do both tevent only after drop, so we can ignore 0 duraction event
+                    # TODO: 'flush' out takes without a drop (e.g. perf stopped before drop happned)
+                    name = "GIL-take"
+                    scope = "t"  # thread scope
+                    event = {"pid": parent_pid, "tid": f'{pid}', "ts": time_gil_take, "name": name, "ph": "i", "cat": "GIL state", 's': scope, 'cname': 'terrible'}
+                    yield header, event
+
+                    name = "GIL-drop"
+                    scope = "t"  # thread scope
+                    event = {"pid": parent_pid, "tid": f'{pid}', "ts": time_gil_drop, "name": name, "ph": "i", "cat": "GIL state", 's': scope, 'args': args, 'cname': 'good'}
+                    yield header, event
+
+                if as_async:
+                    begin, end = 'b', 'e'
+
+                else:
+                    begin, end = 'B', 'E'
+                event_id = int(time*1e3)
+                name = "GIL-flow"
+                common = {"pid": parent_pid if as_async else f'{parent_pid}-GIL', "tid": f'{pid}', 'cat': 'GIL state', 'args': args, 'id': event_id, 'cname': 'terrible'}
+                # we may have called take_gil earlier than we got it back
+                # sth like [called take [ take success [still dropping]]]
+                if not only_lock and pid in wants_take_gil:
+                    yield header, {"name": 'GIL(take)', "ph": begin, "ts": wants_take_gil[pid], **common, 'cname': 'bad'}
+                yield header, {"name": 'GIL',   "ph": begin, "ts": time_gil_take + jitter, **common}
+                if not only_lock:
+                    yield header, {"name": 'GIL(drop)',   "ph": begin, "ts": wants_drop_gil[pid], **common}
+                    yield header, {"name": 'GIL(drop)', "ph": end, "ts": time_gil_drop, **common}
+                yield header, {"name": 'GIL', "ph": end, "ts": time_gil_drop, **common}
+                if not only_lock and pid in wants_take_gil:
+                    yield header, {"name": 'GIL(take)', "ph": end, "ts": time_gil_drop, **common, 'cname': 'bad'}
+            else:
+                if event not in ignored:
+                    print(f'ignoring {event}', file=sys.stderr)
+                    ignored.add(event)
+        except:
+            print("error on line", header, file=sys.stderr)
+            raise
+    if verbose >= 1:
+        pids = ["PID"]
+        totals = ["total"]
+        wait = ["wait"]
+        table = []
+        for pid in t_min:
+            total = t_max[pid] - t_min[pid]
+            wait = time_wait_gil[pid]
+            on_gil = time_on_gil[pid]
+            no_gil = total - wait - on_gil
+            row = [pid, total, no_gil/total*100, on_gil/total*100, wait/total * 100]
+            if verbose >= 2:
+                row.extend([no_gil, on_gil, wait])
+            table.append(row)
+        headers = ['PID', 'total(us)', 'no gil%✅', 'has gil%❗', 'gil wait%❌']
+        if verbose:
+            headers.extend(['no gil(us)', 'has gil(us)', 'gil wait(us)'])
+        table = tabulate.tabulate(table, headers)
+        print()
+        print("Summary of threads:")
+        print()
+        print(table)
+        print()
+        print("High 'no gil' is good (✅), we like low 'has gil' (❗),\n and we don't want 'gil wait' (❌).")
+        print()
+
+
 def perf2trace(input, verbose=1, store_runing=False, store_sleeping=True, all_tracepoints=False):
     # useful for debugging, to have the pids a name
     pid_names = {}
@@ -214,7 +406,7 @@ def perf2trace(input, verbose=1, store_runing=False, store_sleeping=True, all_tr
         except BrokenPipeError:
             break
         except:
-            print("error on line", header, other)
+            print("error on line", header, other, file=sys.stderr)
             raise
 
 
diff --git a/per4m/record.py b/per4m/record.py
index 35705ff..3c2e252 100644
--- a/per4m/record.py
+++ b/per4m/record.py
@@ -25,16 +25,23 @@ $ perf-pyrecord -e cycles -m per4m.example2
 """
 
 class PerfRecord:
-    def __init__(self, output='perf.data', args=[], verbose=1):
+    def __init__(self, output='perf.data', args=[], verbose=1, stacktrace=True, buffer_size="128M"):
         self.output = output
+        self.buffer_size = buffer_size
         self.args = args
+        self.stacktrace = stacktrace
         self.verbose = verbose
 
     def __enter__(self):
+        self.start()
+
+    def start(self):
         pid = os.getpid()
         # cmd = f"perf record -e 'sched:*' --call-graph dwarf -k CLOCK_MONOTONIC --pid {pid} -o {self.output}"
         perf_args = ' '.join(self.args)
-        cmd = f"perf record {perf_args} --call-graph dwarf -k CLOCK_MONOTONIC --pid {pid} -o {self.output}"
+        if self.stacktrace:
+            perf_args += " --call-graph dwarf"
+        cmd = f"perf record  {perf_args} -k CLOCK_MONOTONIC --pid {pid} -o {self.output}"
         if self.verbose >= 2:
             print(f"Running: {cmd}")
         args = shlex.split(cmd)
@@ -48,7 +55,7 @@ class PerfRecord:
             # we need to wait till perf creates the file
             time.sleep(0.1)
         else:
-            self.finish()
+            self._finish()
             raise OSError(f'perf did not create {self.output}')
         start_size = os.path.getsize(self.output)
         for _ in range(RETRIES):
@@ -58,13 +65,13 @@ class PerfRecord:
             # we need to wait till perf writes
             time.sleep(0.1)
         else:
-            self.finish()
+            self._finish()
             raise OSError(f'perf did not write to {self.output}')
         # and give perf a bit more time
         time.sleep(0.05)
         return self
 
-    def finish(self):
+    def _finish(self):
         self.perf.terminate()
         outs, errs = self.perf.communicate(timeout=5)
         if self.verbose >= 1:
@@ -77,7 +84,10 @@ class PerfRecord:
             raise OSError(f'perf record fails, got exit code {self.perf.returncode}')
 
     def __exit__(self, *args):
-        self.finish()
+        self.stop()
+
+    def stop(self):
+        self._finish()
         if self.verbose >= 1:
             print('Wait for perf to finish...')
         self.perf.wait()