Hi,
On 2017-06-22 14:08:45 -0700, Andres Freund wrote:
> At pgcon some people were talking about the difficulty of instrumenting
> the time actually spent waiting for lwlocks and related measurements.
> I'd mentioned that linux these days provides infrastructure to measure
> such things in unmodified binaries.
>
> Attached is a prototype of a script that measures the time spent inside
> PGSemaphoreLock(), aggregates that inside the kernel, grouped by pid and
> stacktrace. That allows one to generate nice flame graphs showing which
> part of the code waits how long for lwlocks.
>
> The attached script clearly needs improvements, but I thought I'd show
> some of the results it can get. To run it you need the the python
> library of the 'bcc' project [1], and a sufficiently new kernel. Some
> distributions, e.g. newer debian versions, package this as python-bpfcc
> and similar.
>
> The output of the script can be turned into a flamegraph with
> https://github.com/brendangregg/FlameGraph 's flamegraph.pl.
The script has bitrot slightly, due to python3 and postgres changes (the
move to posix semaphores). Updated version attached.
Based on the discussion in
https://www.postgresql.org/message-id/20200813003934.yrm4qqngfygr6ii7%40alap3.anarazel.de
Greetings,
Andres Freund
#!/usr/bin/python
from __future__ import print_function
from bcc import BPF
from time import sleep
import argparse
import signal
def positive_int(val):
try:
ival = int(val)
except ValueError:
raise argparse.ArgumentTypeError("must be an integer")
if ival < 0:
raise argparse.ArgumentTypeError("must be positive")
return ival
def positive_nonzero_int(val):
ival = positive_int(val)
if ival == 0:
raise argparse.ArgumentTypeError("must be nonzero")
return ival
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>
struct stats_key_t {
u32 pid;
int user_stack_id;
};
struct stats_value_t {
u64 total_time;
};
struct start_key_t {
u32 pid;
};
struct start_value_t {
u64 last_start;
};
// map_type, key_type, leaf_type, table_name, num_entry
BPF_HASH(stats, struct stats_key_t, struct stats_value_t);
BPF_HASH(start, struct start_key_t, struct start_value_t);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);
int trace_sem_entry(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
struct start_key_t start_key = {};
struct start_value_t start_value = {};
if (!(THREAD_FILTER)) {
return 0;
}
start_key.pid = pid;
start_value.last_start = bpf_ktime_get_ns();
start.update(&start_key, &start_value);
return 0;
}
int trace_sem_return(struct pt_regs *ctx)
{
u32 pid = bpf_get_current_pid_tgid();
struct stats_key_t stats_key = {};
struct start_key_t start_key = {};
struct stats_value_t zero = {};
struct stats_value_t *stats_value;
struct start_value_t *start_value;
u64 delta;
if (!(THREAD_FILTER)) {
return 0;
}
start_key.pid = pid;
start_value = start.lookup(&start_key);
if (!start_value)
return 0; /* missed start */;
delta = bpf_ktime_get_ns() - start_value->last_start;
start.delete(&start_key);
stats_key.pid = pid;
stats_key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_REUSE_STACKID | BPF_F_USER_STACK);
stats_value = stats.lookup_or_init(&stats_key, &zero);
stats_value->total_time += delta;
return 0;
}
"""
examples = """examples:
./pgsemwait.py -x BINARY # trace postgres lwlock wait time until Ctrl-C
./pgsemwait.py -x BINARY 5 # trace for 5 seconds only
./pgsemwait.py -x BINARY -p PID # trace PID
"""
parser = argparse.ArgumentParser(
description="Measure Postgres LWLock Wait Time",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=examples)
parser.add_argument("-x", "--binary", metavar="BINARY", dest="binary", required = True,
help="path to postgres binary")
parser.add_argument("-p", "--pid", metavar="PID", dest="pid",
help="trace this PID only", type=positive_int)
parser.add_argument("-f", "--folded", action="store_true",
help="output folded format")
parser.add_argument("duration", nargs="?", default=99999999,
type=positive_nonzero_int,
help="duration of trace, in seconds")
parser.add_argument("--stack-storage-size", default=1024,
type=positive_nonzero_int,
help="the number of unique stack traces that can be stored and "
"displayed (default 1024)")
args = parser.parse_args()
folded = args.folded
duration = int(args.duration)
# set stack storage size
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
# setup pid filter
thread_filter = '1'
if args.pid is not None:
thread_filter = 'pid == %d' % args.pid
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
binary = args.binary
b = BPF(text=bpf_text)
libpath = BPF.find_exe(binary)
if not libpath:
bail("can't resolve library %s" % library)
library = libpath
b.attach_uprobe(name=library, sym_re='PGSemaphoreLock|futex',
fn_name="trace_sem_entry",
pid = -1)
b.attach_uretprobe(name=library, sym_re='PGSemaphoreLock|futex',
fn_name="trace_sem_return",
pid = -1)
matched = b.num_open_uprobes()
if matched == 0:
print("error: 0 functions traced. Exiting.", file=stderr)
exit(1)
# signal handler
def signal_ignore(signal, frame):
print()
signal.signal(signal.SIGINT, signal_ignore)
sleep(duration)
stats = b.get_table("stats")
stack_traces = b.get_table("stack_traces")
for k, v in stats.items():
user_stack = [] if k.user_stack_id < 0 else \
stack_traces.walk(k.user_stack_id)
name = 'postgres'
if v.total_time == 0:
continue
if folded:
# print folded stack output
user_stack = list(user_stack)
line = [name] + \
[b.sym(addr, k.pid).decode() for addr in reversed(user_stack)]
print("%s %d" % (";".join(line), v.total_time))
else:
# print default multi-line stack output
for addr in user_stack:
print(" %s" % b.sym(addr, k.pid).decode())
print(" %-16s %s (%d)" % ("-", name, k.pid))
print(" %d\n" % v.total_time)