People are starting to capture kernel stack traces and produce Flamegraphs. Those traces currently include the frames used by dt(4) itself:
dt_pcb_ring_get+0x11d dt_prov_profile_enter+0x6e hardclock+0x1a9 lapic_clockintr+0x3f Xresume_lapic_ltimer+0x26 acpicpu_idle+0x261 <--- CPU was Idle sched_idle+0x225 proc_trampoline+0x1c 1 Saving the last 5 frames in the example above consumes CPU time and require some post-processing to cleanup gathered data. So the diff below extends the existing stacktrace_save() into and *_at() version that takes an arbitrary frame as argument. The fun start when we ask "How many frames do we skip?". There are currently two different contexts in which dt(4) can gather data: - in hardlock(9), which mean a specialized interrupt context - in any thread or interrupt context Those two contexts require a different number of frames to reach the recording function: dt_pcb_ring_get(). Now what's fun is that every architecture has its own way to reach hardclock(9) (possibly multiple ones), has its own stack unwinder (with specific bugs) and might use different compilers with different optimizations (yeah!). So the diff below introduce two per-arch defines that specify at which frame the unwinding should start. This has been tested on sparc64 and amd64 and here's the result: # btrace -e 'tracepoint:sched:sleep { printf("%s1\n", kstack); }' sleep_setup+0x16c msleep+0x160 taskq_next_work+0x5c taskq_thread+0x38 0x1013074 1 Note that the number of frames for hardclock(9) on amd64 assumes lapic is used. That's why on vmm(4) you'll still see an extra frame as below: # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }' Xresume_legacy0+0x1d3 cpu_idle_cycle+0x1b proc_trampoline+0x1c 1 Whereas on real hardware: # btrace -e 'profile:hz:99 { printf("%s1\n", kstack); }' acpicpu_idle+0x1d2 sched_idle+0x225 proc_trampoline+0x1c 1 The diff below will break compiling dt(4) on the other archs because they don't yet provide stacktrace_save_at(), but it shouldn't be hard to fix ;) Ok? Index: dev/dt/dt_dev.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_dev.c,v retrieving revision 1.4 diff -u -p -r1.4 dt_dev.c --- dev/dt/dt_dev.c 4 Feb 2020 10:56:15 -0000 1.4 +++ dev/dt/dt_dev.c 19 Mar 2020 14:08:53 -0000 @@ -26,29 +26,40 @@ #include <dev/dt/dtvar.h> /* - * How many frames are used by the profiling code? For example - * on amd64: + * The number of frames required to execute dt(4) profiling code + * depends on the probe, context, architecture and possibly the + * compiler. * - * From syscall provider: + * Static probes (tracepoints) are executed in the context of the + * current thread and only need to skip frames up to the recording + *function. For example the syscall provider: * * dt_prov_syscall_entry+0x141 * syscall+0x205 <--- start here * Xsyscall+0x128 * - * From profile provider: + * Probes executed in their own context, like the profile provider, + * need to skip the frames of that context which are different for + * every architecture. For example the profile provider executed + * from hardclock(9) on amd64: * * dt_prov_profile_enter+0x6e - * hardclock+0x12c - * clockintr+0x59 - * intr_handler+0x6e - * Xresume_legacy0+0x1d3 - * cpu_idle_cycle+0x1b <---- start here. + * hardclock+0x1a9 + * lapic_clockintr+0x3f + * Xresume_lapic_ltimer+0x26 + * acpicpu_idle+0x1d2 <---- start here. + * sched_idle+0x225 * proc_trampoline+0x1c */ -#if notyet -#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4) +#if defined(__amd64__) +#define DT_FA_PROFILE __builtin_frame_address(4) +#define DT_FA_STATIC __builtin_frame_address(1) +#elif defined(__sparc64__) +#define DT_FA_PROFILE __builtin_frame_address(4) +#define DT_FA_STATIC __builtin_frame_address(0) #else -#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0) +#define DT_FA_STATIC __builtin_frame_address(0) +#define DT_FA_PROFILE DT_FA_STATIC #endif #define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */ @@ -583,7 +594,7 @@ dt_pcb_filter(struct dt_pcb *dp) * Get a reference to the next free event state from the ring. */ struct dt_evt * -dt_pcb_ring_get(struct dt_pcb *dp) +dt_pcb_ring_get(struct dt_pcb *dp, int profiling) { struct proc *p = curproc; struct dt_evt *dtev; @@ -617,11 +628,10 @@ dt_pcb_ring_get(struct dt_pcb *dp) memcpy(dtev->dtev_comm, p->p_p->ps_comm, DTMAXCOMLEN - 1); if (ISSET(dp->dp_evtflags, DTEVT_KSTACK|DTEVT_USTACK)) { -#if notyet - stacktrace_save_at(&dtev->dtev_kstack, DT_HOOK_FRAME_ADDRESS); -#else - stacktrace_save(&dtev->dtev_kstack); -#endif + if (profiling) + stacktrace_save_at(&dtev->dtev_kstack, DT_FA_PROFILE); + else + stacktrace_save_at(&dtev->dtev_kstack, DT_FA_STATIC); } return dtev; Index: dev/dt/dt_prov_profile.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_profile.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_profile.c --- dev/dt/dt_prov_profile.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_profile.c 19 Mar 2020 14:06:01 -0000 @@ -107,7 +107,7 @@ dt_prov_profile_fire(struct dt_pcb *dp) if (++dp->dp_nticks < dp->dp_maxtick) return; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 1); if (dtev == NULL) return; dt_pcb_ring_consume(dp, dtev); Index: dev/dt/dt_prov_static.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_static.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_static.c --- dev/dt/dt_prov_static.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_static.c 19 Mar 2020 14:06:01 -0000 @@ -120,7 +120,7 @@ dt_prov_static_hook(struct dt_provider * SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; Index: dev/dt/dt_prov_syscall.c =================================================================== RCS file: /cvs/src/sys/dev/dt/dt_prov_syscall.c,v retrieving revision 1.1 diff -u -p -r1.1 dt_prov_syscall.c --- dev/dt/dt_prov_syscall.c 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dt_prov_syscall.c 19 Mar 2020 14:06:01 -0000 @@ -148,7 +148,7 @@ dt_prov_syscall_entry(struct dt_provider SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; @@ -190,7 +190,7 @@ dt_prov_syscall_return(struct dt_provide SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { struct dt_evt *dtev; - dtev = dt_pcb_ring_get(dp); + dtev = dt_pcb_ring_get(dp, 0); if (dtev == NULL) continue; Index: dev/dt/dtvar.h =================================================================== RCS file: /cvs/src/sys/dev/dt/dtvar.h,v retrieving revision 1.1 diff -u -p -r1.1 dtvar.h --- dev/dt/dtvar.h 21 Jan 2020 16:16:23 -0000 1.1 +++ dev/dt/dtvar.h 19 Mar 2020 14:06:01 -0000 @@ -194,7 +194,7 @@ void dt_pcb_free(struct dt_pcb *); void dt_pcb_purge(struct dt_pcb_list *); int dt_pcb_filter(struct dt_pcb *); -struct dt_evt *dt_pcb_ring_get(struct dt_pcb *); +struct dt_evt *dt_pcb_ring_get(struct dt_pcb *, int); void dt_pcb_ring_consume(struct dt_pcb *, struct dt_evt *); /* Index: sys/stacktrace.h =================================================================== RCS file: /cvs/src/sys/sys/stacktrace.h,v retrieving revision 1.1 diff -u -p -r1.1 stacktrace.h --- sys/stacktrace.h 20 Jan 2020 15:58:23 -0000 1.1 +++ sys/stacktrace.h 19 Mar 2020 14:06:01 -0000 @@ -28,6 +28,7 @@ struct stacktrace { #ifdef _KERNEL void stacktrace_print(struct stacktrace *, int (*)(const char *, ...)); +void stacktrace_save_at(struct stacktrace *, void *); void stacktrace_save(struct stacktrace *); #endif Index: arch/amd64/amd64/db_trace.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/db_trace.c,v retrieving revision 1.49 diff -u -p -r1.49 db_trace.c --- arch/amd64/amd64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.49 +++ arch/amd64/amd64/db_trace.c 19 Mar 2020 14:06:01 -0000 @@ -256,11 +256,13 @@ db_stack_trace_print(db_expr_t addr, int } void -stacktrace_save(struct stacktrace *st) +stacktrace_save_at(struct stacktrace *st, void *xframe) { - struct callframe *frame, *lastframe; + struct callframe *lastframe, *frame = xframe; + + if (frame == NULL) + frame = __builtin_frame_address(0); - frame = __builtin_frame_address(0); st->st_count = 0; while (st->st_count < STACKTRACE_MAX) { st->st_pc[st->st_count++] = frame->f_retaddr; @@ -275,6 +277,12 @@ stacktrace_save(struct stacktrace *st) if (!INKERNEL(frame->f_retaddr)) break; } +} + +void +stacktrace_save(struct stacktrace *st) +{ + return stacktrace_save_at(st, NULL); } vaddr_t Index: arch/sparc64/sparc64/db_trace.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/db_trace.c,v retrieving revision 1.20 diff -u -p -r1.20 db_trace.c --- arch/sparc64/sparc64/db_trace.c 20 Jan 2020 15:58:23 -0000 1.20 +++ arch/sparc64/sparc64/db_trace.c 19 Mar 2020 14:06:01 -0000 @@ -155,7 +155,7 @@ db_stack_trace_print(db_expr_t addr, int } void -stacktrace_save(struct stacktrace *st) +stacktrace_save_at(struct stacktrace *st, void *xframe) { struct frame64 *f64; vaddr_t pc; @@ -163,7 +163,10 @@ stacktrace_save(struct stacktrace *st) write_all_windows(); - frame = (vaddr_t)__builtin_frame_address(0) - BIAS; + if (xframe == NULL) + xframe = __builtin_frame_address(0); + + frame = (vaddr_t)xframe - BIAS; if ((frame & 1) == 0) return; @@ -185,6 +188,11 @@ stacktrace_save(struct stacktrace *st) } } +void +stacktrace_save(struct stacktrace *st) +{ + return stacktrace_save_at(st, NULL); +} void db_dump_window(db_expr_t addr, int have_addr, db_expr_t count, char *modif) {