Hello Tom, Thanks for sharing your work, that's awesome!
On 14/11/20(Sat) 13:13, Tom Rollet wrote: > Here is a diff for dynamic tracing of kernel's functions boundaries. > It's implemented as one of the dt's provider on i386 and amd64. > To activate it, DDBPROF and pseudo device dt must be activated > on GENERIC. Could we replace the DDBPROF define by NDT > 0? Is it still possible to build a profiling kernel if we do so? Being able to use the existing kernel profiling code is a nice way to test prologue patching but if performances are really bad we might want to keep a way to use the old profiling method. If it's too complicated to have both coexist I'd suggest we get rid of DBPROF and use the intr-based patching code for dt(4) only. > For now it's working like DTRACE fbt(function boundaries tracing). > > We replace the prologue and epilogue of each function with a breakpoint. > The replaced instruction on amd64 and 386 are respectively > "push %rbp", "push %ebp" for prologue and "ret", "pop %ebp" for epilogue. > These instructions are emulated at the end of the breakpoint handler, > just after sending an event to userland (to be read by btrace). > > For now the lookup to find the instruction to replace is hardcoded > and needs to find if there is a retguard before the prologue of the > function or if there are multiples int3 after the last ret. > It allows to find 31896 entry points and 19513 return points on amd64. That's a great start. > ddb has also a similar way of tracing all prologue of function. It now > uses this new dt provider for doing tracing. > > Perf wise, when all entry probes are enabled, the slow down is > quite massive but expected since breakpoints are slow. > On the kernel compilation with 10 threads on a linux qemu VM we go from: > real 2m38.280s > user 10m2.050s > sys 14m10.360s > > to: > real 24m19.280s > user 9m44.110s > sys 220m26.980s Did you try on i386 as well? How is it? > Any comments on the diff? See inline. > diff --git a/sys/arch/amd64/amd64/vector.S b/sys/arch/amd64/amd64/vector.S > index dd2dfde3e3b..3a0a58ba3ac 100644 > --- a/sys/arch/amd64/amd64/vector.S > +++ b/sys/arch/amd64/amd64/vector.S > @@ -188,10 +188,11 @@ INTRENTRY_LABEL(trap03): > sti > cld > SMAP_CLAC > - movq %rsp, %rdi > - call _C_LABEL(db_prof_hook) > - cmpl $1, %eax > - jne .Lreal_kern_trap > + leaq _C_LABEL(dt_prov_kprobe), %rdi > + movq %rsp, %rsi > + call _C_LABEL(dt_prov_kprobe_hook) Why don't we call a function with no argument? Maybe the current interface is over designed and we should not use it. > + cmpl $0, %eax > + je .Lreal_kern_trap > > cli > movq TF_RDI(%rsp),%rdi > @@ -210,6 +211,11 @@ INTRENTRY_LABEL(trap03): > movq TF_R11(%rsp),%r11 > /* %rax restored below, after being used to shift the stack */ > > + cmpl $2, %eax > + je .Lemulate_ret > + > +.Lemulate_push_rbp: > + > /* > * We are returning from a probe trap so we need to fix the > * stack layout and emulate the patched instruction. > @@ -217,6 +223,9 @@ INTRENTRY_LABEL(trap03): > */ > subq $16, %rsp > > + movq (TF_RAX + 16)(%rsp), %rax > + movq %rax, TF_RAX(%rsp) > + > /* Shift hardware-saved registers. */ > movq (TF_RIP + 16)(%rsp), %rax > movq %rax, TF_RIP(%rsp) > @@ -237,7 +246,20 @@ INTRENTRY_LABEL(trap03): > > /* Finally restore %rax */ > movq (TF_RAX + 16)(%rsp),%rax > + jmp .ret_int3 Shouldn't we copy the two instructions here instead of jumping? If you're after reducing code duplication you can look at macros like the ones used in frameasm.h. > +.Lemulate_ret: > + > + /* Store a new return address in %rip */ > + movq TF_RSP(%rsp), %rax > + movq (%rax), %rax > + movq %rax, TF_RIP(%rsp) > + addq $8, TF_RSP(%rsp) > + > + /* Finally restore %rax */ > + movq (TF_RAX)(%rsp),%rax > > +.ret_int3: > addq $TF_RIP,%rsp > iretq > #endif /* !defined(GPROF) && defined(DDBPROF) */ > diff --git a/sys/arch/i386/i386/locore.s b/sys/arch/i386/i386/locore.s > index 0c5607fe38a..e2d43b47eb3 100644 > --- a/sys/arch/i386/i386/locore.s > +++ b/sys/arch/i386/i386/locore.s > @@ -205,7 +205,8 @@ INTRENTRY_LABEL(label): /* from kernel */ ; \ > #define INTRFASTEXIT \ > jmp intr_fast_exit > > -#define INTR_FAKE_TRAP 0xbadabada > +#define INTR_FAKE_TRAP_PUSH_RPB 0xbadabada > +#define INTR_FAKE_TRAP_POP_RBP 0xbcbcbcbc > > /* > * PTmap is recursive pagemap at top of virtual address space. > @@ -1259,17 +1260,32 @@ calltrap: > jne .Lreal_trap > > pushl %esp > - call _C_LABEL(db_prof_hook) > - addl $4,%esp > - cmpl $1,%eax > - jne .Lreal_trap > + subl $4, %esp > + pushl %eax > + leal _C_LABEL(dt_prov_kprobe), %eax > + movl %eax, 4(%esp) > + popl %eax > + call _C_LABEL(dt_prov_kprobe_hook) Same comment as above, we can use a f(void) hook, no? > + addl $8,%esp > + cmpl $0,%eax > + je .Lreal_trap > > /* > * Abuse the error field to indicate that INTRFASTEXIT needs > * to emulate the patched instruction. > */ > - movl $INTR_FAKE_TRAP, TF_ERR(%esp) > - jz .Lalltraps_check_asts > + cmpl $1, %eax > + je .Lset_emulate_push_rbp > + > + cmpl $2, %eax > + je .Lset_emulate_ret Instead of returning 0/1/2 you could be returning $INTR_FAKE_TRAP_PUSH_RPB so you could remove many jumps by always moving this to TF_ERR(%esp). > +.Lset_emulate_push_rbp: > + movl $INTR_FAKE_TRAP_PUSH_RPB, TF_ERR(%esp) > + jmp .Lalltraps_check_asts > +.Lset_emulate_ret: > + movl $INTR_FAKE_TRAP_POP_RBP, TF_ERR(%esp) > + jmp .Lalltraps_check_asts > .Lreal_trap: > #endif /* !defined(GPROF) && defined(DDBPROF) */ > pushl %esp > @@ -1298,8 +1314,10 @@ calltrap: > * The code below does that by trashing %eax, so it MUST be > * restored afterward. > */ > - cmpl $INTR_FAKE_TRAP, TF_ERR(%esp) > - je .Lprobe_fixup > + cmpl $INTR_FAKE_TRAP_PUSH_RPB, TF_ERR(%esp) > + je .Lprobe_fixup_push_rbp > + cmpl $INTR_FAKE_TRAP_POP_RBP, TF_ERR(%esp) > + je .Lprobe_fixup_pop_rbp > #endif /* !defined(GPROF) && defined(DDBPROF) */ > #ifndef DIAGNOSTIC > INTRFASTEXIT > @@ -1327,7 +1345,7 @@ spl_lowered: > > .text > #if !defined(GPROF) && defined(DDBPROF) > -.Lprobe_fixup: > +.Lprobe_fixup_push_rbp: > /* Restore all register unwinding the stack. */ > INTR_RESTORE_ALL > > @@ -1352,6 +1370,26 @@ spl_lowered: > > popl %eax > iret > +.Lprobe_fixup_pop_rbp: > + /* Restore all register unwinding the stack. */ > + INTR_RESTORE_ALL > + > + movl %eax,0(%esp) > + > + /* pop %ebp */ > + movl 20(%esp), %ebp > + /* Shift hardware-saved registers: eflags, cs, eip */ > + movl 16(%esp), %eax > + movl %eax, 20(%esp) > + movl 12(%esp), %eax > + movl %eax, 16(%esp) > + movl 8(%esp), %eax > + movl %eax, 12(%esp) > + > + /* Pop eax and restore the stack pointer */ > + popl %eax > + addl $8, %esp > + iret > #endif /* !defined(GPROF) && defined(DDBPROF) */ > > .text > diff --git a/sys/ddb/db_prof.c b/sys/ddb/db_prof.c > index 7d8190f41bc..8b3e52b8c9c 100644 > --- a/sys/ddb/db_prof.c > +++ b/sys/ddb/db_prof.c ^^^^^^^^ You can also remove my copyright from this file since you moved all the code that I wrote to dt_prov_kprobe.c, all what remains is a copy of the mcount() function. > @@ -232,6 +117,8 @@ db_prof_count(unsigned long frompc, unsigned long selfpc) > if (p->state != GMON_PROF_ON) > return; > > + frompc = db_get_pc(frame); > + selfpc = db_get_probe_addr(frame); > /* > * check that frompcindex is a reasonable pc value. > * for example: signal catchers get called from the stack, > @@ -241,6 +128,7 @@ db_prof_count(unsigned long frompc, unsigned long selfpc) > if (frompc > p->textsize) > return; > > + ^^ This change is unnecessary ;) > #if (HASHFRACTION & (HASHFRACTION - 1)) == 0 > if (p->hashfraction == HASHFRACTION) > frompcindex = > diff --git a/sys/dev/dt/dt_dev.c b/sys/dev/dt/dt_dev.c > index 51dc5e1cd81..64889ac8225 100644 > --- a/sys/dev/dt/dt_dev.c > +++ b/sys/dev/dt/dt_dev.c > @@ -122,7 +122,7 @@ int dt_ioctl_get_stats(struct dt_softc *, struct > dtioc_stat *); > int dt_ioctl_record_start(struct dt_softc *); > void dt_ioctl_record_stop(struct dt_softc *); > int dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *); > -void dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *); > +int dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *); > > int dt_pcb_ring_copy(struct dt_pcb *, struct dt_evt *, size_t, uint64_t > *); > > @@ -136,6 +136,7 @@ dtattach(struct device *parent, struct device *self, void > *aux) > dt_nprobes += dt_prov_profile_init(); > dt_nprobes += dt_prov_syscall_init(); > dt_nprobes += dt_prov_static_init(); > + dt_nprobes += dt_prov_kprobe_init(); The kprobe provider relies on DDB symbols, no? If so we should keep all its code under #ifdef DDB. > printf("dt: %u probes\n", dt_nprobes); > } > @@ -276,6 +277,7 @@ dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, > struct proc *p) > return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr); > case DTIOCRECORD: > case DTIOCPRBENABLE: > + case DTIOCPRBDISABLE: > /* root only ioctl(2) */ > break; > default: > @@ -296,6 +298,9 @@ dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, > struct proc *p) > case DTIOCPRBENABLE: > error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr); > break; > + case DTIOCPRBDISABLE: > + error = dt_ioctl_probe_disable(sc, (struct dtioc_req *)addr); > + break; > default: > KASSERT(0); > } > @@ -479,6 +484,36 @@ dt_ioctl_probe_enable(struct dt_softc *sc, struct > dtioc_req *dtrq) > return 0; > } > > +int > +dt_ioctl_probe_disable(struct dt_softc *sc, struct dtioc_req *dtrq) > +{ > + struct dt_probe *dtp; > + int error; > + > + KASSERT(suser(curproc) == 0); > + if (!dtioc_req_isvalid(dtrq)) > + return EINVAL; > + > + SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) { > + if (dtp->dtp_pbn == dtrq->dtrq_pbn) > + break; > + } > + if (dtp == NULL) > + return ENOENT; > + > + if (dtp->dtp_prov->dtpv_desalloc) > + { > + error = dtp->dtp_prov->dtpv_desalloc(dtp, sc, dtrq); > + if (error) > + return error; > + } > + > + DPRINTF("dt%d: pid %d desalloc\n", sc->ds_unit, sc->ds_pid, > + dtrq->dtrq_pbn); > + > + return 0; > +} Having a new ioctl(2) and a new step in the setup/teardown process might be indeed need. In any case we must also ensure that functions are correctly unpatched even if the ioctl(2) hasn't been issued when dt(4) is closed. Can you explain when are functions patched/unpatched and why you choose to do it this way? Did you consider patching/unlatching just before starting/ending the recording? > struct dt_probe * > dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider > *dtpv) > { > @@ -493,6 +528,9 @@ dt_dev_alloc_probe(const char *func, const char *name, > struct dt_provider *dtpv) > dtp->dtp_func = func; > dtp->dtp_name = name; > dtp->dtp_sysnum = -1; > + dtp->dtp_ref = 0; > + > + mtx_init(&dtp->dtp_mtx, IPL_HIGH); Do we need the mutex for anything but reference counting? Can we use atomic operations like it is done in other subsystems of the kernel? Does this field need to be in the generic `dtp' structure or can the reference counting be kept in the kprobe-specific descriptor? > return dtp; > } > diff --git a/sys/dev/dt/dt_prov_kprobe.c b/sys/dev/dt/dt_prov_kprobe.c > new file mode 100644 > index 00000000000..bd7c06e5680 > --- /dev/null > +++ b/sys/dev/dt/dt_prov_kprobe.c > @@ -0,0 +1,497 @@ > +/* > + * Copyright (c) 2020 Tom Rollet <tom.rol...@epita.fr> > + * > + * Permission to use, copy, modify, and distribute this software for any > + * purpose with or without fee is hereby granted, provided that the above > + * copyright notice and this permission notice appear in all copies. > + * > + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES > + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF > + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR > + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES > + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN > + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF > + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. > + */ > + > +#include <sys/types.h> > +#include <sys/systm.h> > +#include <sys/param.h> > +#include <sys/malloc.h> > +#include <sys/atomic.h> > +#include <sys/exec_elf.h> > + > +#include <ddb/db_elf.h> > +#include <machine/db_machdep.h> > +#include <ddb/db_extern.h> > +#include <ddb/db_access.h> > +#include <ddb/db_sym.h> > +#include <ddb/db_interface.h> > + > +#include <dev/dt/dtvar.h> > + > +int > +dt_prov_kprobe_alloc(struct dt_probe *dtp, struct dt_softc *sc, > + struct dt_pcb_list *plist, struct dtioc_req *dtrq); > +int dt_prov_kprobe_hook(struct dt_provider *dtpv, ...); > +int > +dt_prov_kprobe_desalloc(struct dt_probe *dtp, struct dt_softc *sc, > + struct dtioc_req *dtrq); Please look at style(9) and how the other dt* files look like they are many style inconsistency in this file but I won't pinpoint any of them in this reviews to not introduce more noise ;) > +void db_prof_count(struct trapframe *frame); > + > + > +struct kprobe_probe > +{ > + struct dt_probe* dtp; > + SLIST_ENTRY(kprobe_probe) kprobe_next; > +}; Could we add the reference counting I mentioned earlier in this structure? > + > +uint32_t hash_tmp( uint32_t a) > +{ > + a = (a+0x7ed55d16) + (a<<12); > + a = (a^0xc761c23c) ^ (a>>19); > + a = (a+0x165667b1) + (a<<5); > + a = (a+0xd3a2646c) ^ (a<<9); > + a = (a+0xfd7046c5) + (a<<3); > + a = (a^0xb55a4f09) ^ (a>>16); > + return a; > +} Where does this hash comes from? Why did you pick this one? This is worth a comment. > + > +#define PPTSIZE PAGE_SIZE * 30 > +#define PPTMASK ((PPTSIZE / sizeof(struct kprobe_probe)) - 1) > +#define INSTTOIDX(inst) ( hash_tmp(inst) & PPTMASK) > + > +SLIST_HEAD(, kprobe_probe) *dtpf_entry; > +SLIST_HEAD(, kprobe_probe) *dtpf_return; > +int nb_probes_entry = 0; > +int nb_probes_return = 0; > + > + > +#define DTEVT_PROV_KPROBE (DTEVT_COMMON|DTEVT_FUNCARGS) > + > +#define KPROBE_ENTRY "entry" > +#define KPROBE_RETURN "return" > + > +#if defined(__amd64__) > +#define KPROBE_RETGUARD_MOV_1 0x4c > +#define KPROBE_RETGUARD_MOV_2 0x8b > +#define KPROBE_RETGUARD_MOV_3 0x1d > + > +#define KPROBE_RETGUARD_MOV_SIZE 7 > + > +#define KPROBE_RETGUARD_XOR_1 0x4c > +#define KPROBE_RETGUARD_XOR_2 0x33 > +#define KPROBE_RETGUARD_XOR_3 0x1c > + > +#define KPROBE_RETGUARD_XOR_SIZE 4 > + > +#define RET 0xc3 > +#define RET_SIZE 1 > + > +#elif defined(__i386__) > +#define POP_RBP 0x5d > +#define POP_RBP_SIZE 1 The idea of using a define is also to reduce the #ifdef maze below. So if RET is not a good name pick another one :o) > +#endif > + > +struct dt_provider dt_prov_kprobe = { > + .dtpv_name = "kprobe", > + .dtpv_alloc = dt_prov_kprobe_alloc, > + .dtpv_enter = dt_prov_kprobe_hook, > + .dtpv_leave = NULL, > + .dtpv_desalloc = dt_prov_kprobe_desalloc, > +}; > + > +extern db_symtab_t db_symtab; > +extern char __kutext_end[]; > +extern int db_prof_on; > + > +/* Initialize all entry and return probes and store them in global arrays */ > + int > +dt_prov_kprobe_init(void) > +{ > +#if defined(__amd64__) || defined(__i386__) > + struct dt_probe *dtp; > + struct kprobe_probe *kprobe_dtp; > + Elf_Sym *symp, *symtab_start, *symtab_end; > + char *strtab, *name; > + vaddr_t inst, limit; > + int nb_sym, nb_probes; > + > + > + nb_sym = (db_symtab.end - db_symtab.start) / sizeof (Elf_Sym); > + nb_probes = nb_probes_entry = nb_probes_return = 0; > + > + dtpf_entry = malloc(PPTSIZE, M_DT, M_NOWAIT|M_ZERO); > + if (dtpf_entry == NULL) > + goto end; Please do not print additional message. simply return here. The probes will be available via "btrace -l" > + dtpf_return = malloc(PPTSIZE, M_DT, M_NOWAIT|M_ZERO); > + if (dtpf_return == NULL) > + goto end; > + > + db_symtab_t *stab = &db_symtab; > + > + symtab_start = STAB_TO_SYMSTART(stab); > + symtab_end = STAB_TO_SYMEND(stab); > + > + strtab = db_elf_find_strtab(stab); > + > + for (symp = symtab_start; symp < symtab_end; symp++) { > + if (ELF_ST_TYPE(symp->st_info) != STT_FUNC) > + continue; > + > + inst = symp->st_value; > + name = strtab + symp->st_name; > + limit = symp->st_value + symp->st_size; > + > + /* Filter function that are not mapped in memory */ > + if (inst < KERNBASE || inst >= (vaddr_t)&__kutext_end) > + continue; > + > + /* Remove some function to avoid recursive tracing */ > + if (strncmp(name, "dt_", 3) == 0 || strncmp(name, "trap", 4) == 0 || > + strncmp(name, "db_", 3) == 0) > + continue; > + > +#if defined(__amd64__) > + /* Find if there is a retguard, if so move the inst pointer to the > later 'push rbp' */ Maybe introduce a function taking a "uint8_t *" as argument then we'll have different implementation per architecture. This should reduce the #ifdef dance and the indentation level :o) > + if (*((uint8_t *)inst) != SSF_INST) { > + /* No retguards in i386 */ > + if (((uint8_t *)inst)[0] != KPROBE_RETGUARD_MOV_1 || > + ((uint8_t *)inst)[1] != KPROBE_RETGUARD_MOV_2 || > + ((uint8_t *)inst)[2] != KPROBE_RETGUARD_MOV_3 || > + ((uint8_t *)inst)[KPROBE_RETGUARD_MOV_SIZE] != > KPROBE_RETGUARD_XOR_1 || > + ((uint8_t *)inst)[KPROBE_RETGUARD_MOV_SIZE + 1] != > KPROBE_RETGUARD_XOR_2 || > + ((uint8_t *)inst)[KPROBE_RETGUARD_MOV_SIZE + 2] != > KPROBE_RETGUARD_XOR_3 || > + ((uint8_t *)inst)[KPROBE_RETGUARD_MOV_SIZE + > KPROBE_RETGUARD_XOR_SIZE] != SSF_INST) > + continue; > + inst = (vaddr_t)&(((uint8_t *)inst)[KPROBE_RETGUARD_MOV_SIZE + > KPROBE_RETGUARD_XOR_SIZE]); > + } > +#elif defined(__i386__) > + if (*((uint8_t *)inst) != SSF_INST) > + continue; > +#endif > + > + dtp = dt_dev_alloc_probe(name, KPROBE_ENTRY, &dt_prov_kprobe); How may probes do you have on your system with this? We might want to use a pool for allocating `dtp' then. But that's for a different diff. > + if (dtp == NULL) > + goto end; > + > + kprobe_dtp = malloc(sizeof(struct kprobe_probe), M_TEMP, > M_NOWAIT|M_ZERO); > + if (kprobe_dtp == NULL) > + goto end; Why do we need two descriptors? Could we fold the two descriptors into a single allocation instead of linking them? Would that have any drawback? We could use the common OO magic, something like: struct dt_kprobe { struct dt_probe dtk_dtp; SLIST_ENTRY(dt_kprobe) dtk_next; /* [I] link in the hash table */ unsigned int dtk_ref; /* [a] # of /dev/dt consumers */ } > + kprobe_dtp->dtp = dtp; > + > + dtp->dtp_addr = inst; > + dtp->dtp_nargs = db_ctf_func_numargs(symp); > + dt_dev_register_probe(dtp); > + > + SLIST_INSERT_HEAD(&dtpf_entry[INSTTOIDX(dtp->dtp_addr)], kprobe_dtp, > kprobe_next); > + > + nb_probes++; > + nb_probes_entry++; > + > + > + /* > + * Poor method to find the return point > + * => we would need a disassembler to find all return points > + * For now we start from the end of the function, iterate on > + * int3 inserted for retguard until we find a ret > + */ Same here, let's use a function then we can decide if we want to turn its implementation using a disassembler :) > +#if defined(__amd64__) > + if (*(uint8_t *)(limit - 1) != RET) > + continue; > + inst = limit - 1; > +#elif defined(__i386__) > + /* > + * Little temporary hack to find some return probe > + * => always int3 after 'pop %rpb; ret' > + */ > + while(*((uint8_t *)inst) == 0xcc) > + (*(uint8_t *)inst) -= 1; > + if (*(uint8_t *)(limit - 2) != POP_RBP) > + continue; > + inst = limit - 2; > +#endif > + > + dtp = dt_dev_alloc_probe(name, KPROBE_RETURN, &dt_prov_kprobe); > + if (dtp == NULL) > + goto end; > + > + kprobe_dtp = malloc(sizeof(struct kprobe_probe), M_TEMP, > M_NOWAIT|M_ZERO); > + if (kprobe_dtp == NULL) > + goto end; > + kprobe_dtp->dtp = dtp; > + > + dtp->dtp_addr = inst; > + dt_dev_register_probe(dtp); > + SLIST_INSERT_HEAD(&dtpf_return[INSTTOIDX(dtp->dtp_addr)], kprobe_dtp, > kprobe_next); > + nb_probes++; > + nb_probes_return++; > + } > +#endif > +end: > + printf("dt kprobe: entry: %d return: %d\n", nb_probes_entry, > nb_probes_return); > + return nb_probes; > +} > + > + int > +dt_prov_kprobe_alloc(struct dt_probe *dtp, struct dt_softc *sc, > + struct dt_pcb_list *plist, struct dtioc_req *dtrq) > +{ > +#if defined(__amd64__) || defined(__i386__) I'd say the keep the define for the whole provider, no need to advertise it for !x86 as long as there's no support :o) This function is nicely MI! > + uint8_t patch = BKPT_INST; > + struct dt_pcb *dp; > + unsigned s; > + > + dp = dt_pcb_alloc(dtp, sc); > + if (dp == NULL) > + return ENOMEM; > + > + /* Patch only if it's first pcb referencing this probe */ > + mtx_enter(&dtp->dtp_mtx); > + dtp->dtp_ref++; > + assert(dtp->dtp_ref != 0); We use KASSERT() in the kernel :) > + > + if (dtp->dtp_ref == 1) { > + s = intr_disable(); > + db_write_bytes(dtp->dtp_addr, BKPT_SIZE, &patch); > + intr_restore(s); Is it safe to call db_write_bytes() w/o KERNEL_LOCK()? If not please add an KERNEL_LOCK_ASSERT() above :) > + } > + mtx_leave(&dtp->dtp_mtx); > + > + dp->dp_filter = dtrq->dtrq_filter; > + dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_KPROBE; > + TAILQ_INSERT_HEAD(plist, dp, dp_snext); As mentioned earlier, I don't know if we should patch the code when allocating the dtp. Others options could be when we enable the tracing or in between the two. > + return 0; > + > +#else > + return ENOENT; > +#endif > +} > + > + int > +dt_prov_kprobe_desalloc(struct dt_probe *dtp, struct dt_softc *sc, > + struct dtioc_req *dtrq) > +{ > +#if defined(__amd64__) || defined(__i386__) > + uint8_t patch; > + int size; > + unsigned s; > + > + if (strcmp(dtp->dtp_name, KPROBE_ENTRY) == 0) > + { > + patch = SSF_INST; > + size = SSF_SIZE; > + } > + else if (strcmp(dtp->dtp_name, KPROBE_RETURN) == 0) > + { > +#if defined(__amd64__) > + patch = RET; > + size = RET_SIZE; > +#elif defined(__i386__) > + patch = POP_RBP; > + size = POP_RBP_SIZE; > +#endif > + } > + else > + assert(0 && "Trying to desalloc not yet implemented probe type"); Maybe use a per-architecture function returning patch & size? > + mtx_enter(&dtp->dtp_mtx); > + dtp->dtp_ref--; > + > + if (dtp->dtp_ref == 0) { > + s = intr_disable(); > + db_write_bytes(dtp->dtp_addr, size, &patch); > + intr_restore(s); > + } > + > + mtx_leave(&dtp->dtp_mtx); > + > + /* Desallocation of pcb is done by dt_pcb_purge while closing the dev */ > + return 0; > + > +#else > + return ENOENT; > +#endif > +} > + > + int > +dt_prov_kprobe_hook(struct dt_provider *dtpv, ...) > +{ > +#if defined(__amd64__) || defined(__i386__) > + struct dt_probe *dtp; > + struct dt_pcb *dp; > + va_list ap; > + struct trapframe *tf; > + int is_dt_bkpt = 0; > + struct kprobe_probe *kprobe_dtp; > + /* Arguments for entry probes */ > + vaddr_t *args; > + size_t argsize; > + /* Return values for return probes*/ > + int error; > + register_t retval[2]; > + > + KASSERT(dtpv == &dt_prov_kprobe); > + > + va_start(ap, dtpv); > + tf = va_arg(ap, struct trapframe*); > + va_end(ap); > + > +#if defined(__amd64__) > + vaddr_t addr = tf->tf_rip - BKPT_SIZE; > +#elif defined(__i386) > + vaddr_t addr = tf->tf_eip - BKPT_SIZE; > +#endif This looks like db_get_probe_addr() ;) > + > + > + SLIST_FOREACH(kprobe_dtp, &dtpf_entry[INSTTOIDX(addr)], kprobe_next) { > + dtp = kprobe_dtp->dtp; > + > + if (dtp->dtp_addr != addr) > + continue; > + > + is_dt_bkpt = 1; > + if (db_prof_on) > + db_prof_count(tf); > + > + if (!dtp->dtp_recording) > + continue; > + > + smr_read_enter(); > + SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { > + struct dt_evt *dtev; > + > + dtev = dt_pcb_ring_get(dp, 0); > + if (dtev == NULL) > + continue; > + > +#if defined(__amd64__) > + args = (vaddr_t *)tf->tf_rdi; > + argsize = 6; /* Only 6 first args are passed on registers */ > +#elif defined(__i386) > + /* All args on stack */ > + args = (vaddr_t *)(tf->tf_esp + 4); > + argsize = 10; > +#endif Could we introduce a function for this? Could it be shared with the stack unwinding/printing code of ddb? > + if (ISSET(dp->dp_evtflags, DTEVT_FUNCARGS)) > + memcpy(dtev->dtev_args, args, argsize); > + > + dt_pcb_ring_consume(dp, dtev); > + } > + smr_read_leave(); > + } > + > + if (is_dt_bkpt) > + return is_dt_bkpt; > + > + SLIST_FOREACH(kprobe_dtp, &dtpf_return[INSTTOIDX(addr)], kprobe_next) { > + dtp = kprobe_dtp->dtp; > + > + if (dtp->dtp_addr != addr) > + continue; > + > + is_dt_bkpt = 2; > + > + if (!dtp->dtp_recording) > + continue; > + > + smr_read_enter(); > + SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { > + struct dt_evt *dtev; > + > + dtev = dt_pcb_ring_get(dp, 0); > + if (dtev == NULL) > + continue; > + > +#if defined(__amd64__) > + retval[0] = tf->tf_rax; > + retval[1] = 0; > + error = 0; > +#elif defined(__i386) > + retval[0] = tf->tf_eax; > + retval[1] = 0; > + error = 0; > +#endif Same here :) > + > + dtev->dtev_retval[0] = retval[0]; > + dtev->dtev_retval[1] = retval[1]; > + dtev->dtev_error = error; > + > + dt_pcb_ring_consume(dp, dtev); > + } > + smr_read_leave(); > + } > + return is_dt_bkpt; > +#else > + return 0; > +#endif > +} > + > +/* Function called by ddb to patch all functions without allocating 1 pcb > per probe */ > +void dt_prov_kprobe_patch_all_entry(void) > +{ > +#if defined(__amd64__) || defined(__i386__) This is nicely MI as well :) > + uint8_t patch = BKPT_INST; > + struct dt_probe *dtp; > + struct kprobe_probe *kprobe_dtp; > + size_t i; > + > + > + > + for (i = 0; i < PPTMASK; ++i) > + { > + SLIST_FOREACH(kprobe_dtp, &dtpf_entry[i], kprobe_next) { > + dtp = kprobe_dtp->dtp; > + > + mtx_enter(&dtp->dtp_mtx); > + dtp->dtp_ref++; > + > + if (dtp->dtp_ref == 1) { > + unsigned s; > + s = intr_disable(); > + db_write_bytes(dtp->dtp_addr, BKPT_SIZE, &patch); > + intr_restore(s); > + } > + > + mtx_leave(&dtp->dtp_mtx); > + } > + } > +#endif > +} > + > +/* Function called by ddb to patch all functions without allocating 1 pcb > per probe */ > +void dt_prov_kprobe_depatch_all_entry(void) > +{ > +#if defined(__amd64__) || defined(__i386__) Same here. > + uint8_t patch = SSF_INST; > + struct dt_probe *dtp; > + struct kprobe_probe *kprobe_dtp; > + > + size_t i; > + > + for (i = 0; i < PPTMASK; ++i) > + { > + SLIST_FOREACH(kprobe_dtp, &dtpf_entry[i], kprobe_next) { > + dtp = kprobe_dtp->dtp; > + > + mtx_enter(&dtp->dtp_mtx); > + dtp->dtp_ref--; > + > + if (dtp->dtp_ref == 0) { > + unsigned s; > + s = intr_disable(); > + db_write_bytes(dtp->dtp_addr, SSF_SIZE, &patch); > + intr_restore(s); > + } > + > + mtx_leave(&dtp->dtp_mtx); > + } > + > + } > + > +#else > +#endif > +} > diff --git a/sys/dev/dt/dt_prov_profile.c b/sys/dev/dt/dt_prov_profile.c > index 60d59af54f7..4cd48b5f676 100644 > --- a/sys/dev/dt/dt_prov_profile.c > +++ b/sys/dev/dt/dt_prov_profile.c > @@ -31,14 +31,15 @@ struct dt_probe *dtpp_interval; /* global > periodic probe */ > > int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, > struct dt_pcb_list *, struct dtioc_req *); > -void dt_prov_profile_enter(struct dt_provider *, ...); > -void dt_prov_interval_enter(struct dt_provider *, ...); > +int dt_prov_profile_enter(struct dt_provider *, ...); > +int dt_prov_interval_enter(struct dt_provider *, ...); > > struct dt_provider dt_prov_profile = { > .dtpv_name = "profile", > .dtpv_alloc = dt_prov_profile_alloc, > .dtpv_enter = dt_prov_profile_enter, > .dtpv_leave = NULL, > + .dtpv_desalloc = NULL, > }; > > struct dt_provider dt_prov_interval = { > @@ -46,6 +47,7 @@ struct dt_provider dt_prov_interval = { > .dtpv_alloc = dt_prov_profile_alloc, > .dtpv_enter = dt_prov_interval_enter, > .dtpv_leave = NULL, > + .dtpv_desalloc = NULL, > }; > > int > @@ -114,7 +116,7 @@ dt_prov_profile_fire(struct dt_pcb *dp) > dp->dp_nticks = 0; > } > > -void > +int > dt_prov_profile_enter(struct dt_provider *dtpv, ...) > { > struct cpu_info *ci = curcpu(); > @@ -130,9 +132,10 @@ dt_prov_profile_enter(struct dt_provider *dtpv, ...) > dt_prov_profile_fire(dp); > } > smr_read_leave(); > + return 0; > } > > -void > +int > dt_prov_interval_enter(struct dt_provider *dtpv, ...) > { > struct dt_pcb *dp; > @@ -144,4 +147,5 @@ dt_prov_interval_enter(struct dt_provider *dtpv, ...) > dt_prov_profile_fire(dp); > } > smr_read_leave(); > + return 0; > } > diff --git a/sys/dev/dt/dt_prov_static.c b/sys/dev/dt/dt_prov_static.c > index d7e10542a0c..08d12940236 100644 > --- a/sys/dev/dt/dt_prov_static.c > +++ b/sys/dev/dt/dt_prov_static.c > @@ -25,12 +25,13 @@ > > int dt_prov_static_alloc(struct dt_probe *, struct dt_softc *, > struct dt_pcb_list *, struct dtioc_req *); > -void dt_prov_static_hook(struct dt_provider *, ...); > +int dt_prov_static_hook(struct dt_provider *, ...); > > struct dt_provider dt_prov_static = { > .dtpv_name = "tracepoint", > .dtpv_alloc = dt_prov_static_alloc, > .dtpv_enter = dt_prov_static_hook, > + .dtpv_desalloc = NULL, > }; > > /* > @@ -120,7 +121,7 @@ dt_prov_static_alloc(struct dt_probe *dtp, struct > dt_softc *sc, > return 0; > } > > -void > +int > dt_prov_static_hook(struct dt_provider *dtpv, ...) > { > struct dt_probe *dtp; > @@ -146,13 +147,14 @@ dt_prov_static_hook(struct dt_provider *dtpv, ...) > if (dtev == NULL) > continue; > > - dtev->dtev_sysargs[0] = args[0]; > - dtev->dtev_sysargs[1] = args[1]; > - dtev->dtev_sysargs[2] = args[2]; > - dtev->dtev_sysargs[3] = args[3]; > - dtev->dtev_sysargs[4] = args[4]; > + dtev->dtev_args[0] = args[0]; > + dtev->dtev_args[1] = args[1]; > + dtev->dtev_args[2] = args[2]; > + dtev->dtev_args[3] = args[3]; > + dtev->dtev_args[4] = args[4]; This change could already go in. Could you extract the diff and I'll commit it. > > dt_pcb_ring_consume(dp, dtev); > } > smr_read_leave(); > + return 1; > } > diff --git a/sys/dev/dt/dt_prov_syscall.c b/sys/dev/dt/dt_prov_syscall.c > index b2e82551929..0c1c23b290a 100644 > --- a/sys/dev/dt/dt_prov_syscall.c > +++ b/sys/dev/dt/dt_prov_syscall.c > @@ -37,7 +37,7 @@ unsigned int dtps_nsysent = SYS_MAXSYSCALL; > > int dt_prov_syscall_alloc(struct dt_probe *, struct dt_softc *, > struct dt_pcb_list *, struct dtioc_req *); > -void dt_prov_syscall_entry(struct dt_provider *, ...); > +int dt_prov_syscall_entry(struct dt_provider *, ...); > void dt_prov_syscall_return(struct dt_provider *, ...); > > struct dt_provider dt_prov_syscall = { > @@ -45,6 +45,7 @@ struct dt_provider dt_prov_syscall = { > .dtpv_alloc = dt_prov_syscall_alloc, > .dtpv_enter = dt_prov_syscall_entry, > .dtpv_leave = dt_prov_syscall_return, > + .dtpv_desalloc = NULL, > }; > > int > @@ -119,7 +120,7 @@ dt_prov_syscall_alloc(struct dt_probe *dtp, struct > dt_softc *sc, > return 0; > } > > -void > +int > dt_prov_syscall_entry(struct dt_provider *dtpv, ...) > { > struct dt_probe *dtp; > @@ -136,14 +137,14 @@ dt_prov_syscall_entry(struct dt_provider *dtpv, ...) > args = va_arg(ap, register_t*); > va_end(ap); > > - KASSERT((argsize / sizeof(register_t)) <= DTMAXSYSARGS); > + KASSERT((argsize / sizeof(register_t)) <= DTMAXFUNCARGS); This could go in the same renaming diff. > > if (sysnum < 0 || sysnum >= dtps_nsysent) > - return; > + return 0; > > dtp = dtps_entry[sysnum]; > if (!dtp->dtp_recording) > - return; > + return 0; > > smr_read_enter(); > SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { > @@ -154,11 +155,12 @@ dt_prov_syscall_entry(struct dt_provider *dtpv, ...) > continue; > > if (ISSET(dp->dp_evtflags, DTEVT_FUNCARGS)) > - memcpy(dtev->dtev_sysargs, args, argsize); > + memcpy(dtev->dtev_args, args, argsize); > > dt_pcb_ring_consume(dp, dtev); > } > smr_read_leave(); > + return 0; > } > > void > @@ -196,13 +198,13 @@ dt_prov_syscall_return(struct dt_provider *dtpv, ...) > continue; > > if (error) { > - dtev->dtev_sysretval[0] = -1; > - dtev->dtev_sysretval[1] = 0; > - dtev->dtev_syserror = error; > + dtev->dtev_retval[0] = -1; > + dtev->dtev_retval[1] = 0; > + dtev->dtev_error = error; > } else { > - dtev->dtev_sysretval[0] = retval[0]; > - dtev->dtev_sysretval[1] = retval[1]; > - dtev->dtev_syserror = 0; > + dtev->dtev_retval[0] = retval[0]; > + dtev->dtev_retval[1] = retval[1]; > + dtev->dtev_error = 0; And this too :) > } > > dt_pcb_ring_consume(dp, dtev); > diff --git a/sys/dev/dt/dtvar.h b/sys/dev/dt/dtvar.h > index 3e7985692eb..6d2dd1a8110 100644 > --- a/sys/dev/dt/dtvar.h > +++ b/sys/dev/dt/dtvar.h > @@ -34,9 +34,9 @@ > #define DTMAXCOMLEN 16 > > /* > - * Maximum number of arguments passed to a syscall. > + * Maximum number of arguments passed to a function. > */ > -#define DTMAXSYSARGS 10 > +#define DTMAXFUNCARGS 10 > > /* > * Event state: where to store information when a probe fires. > @@ -54,15 +54,15 @@ struct dt_evt { > struct stacktrace dtev_kstack; /* kernel stack frame */ > char dtev_comm[DTMAXCOMLEN+1]; /* current pr. name */ > union { > - register_t E_entry[DTMAXSYSARGS]; > + register_t E_entry[DTMAXFUNCARGS]; > struct { > register_t __retval[2]; > int __error; > } E_return; > - } _sys; > -#define dtev_sysargs _sys.E_entry /* syscall args. */ > -#define dtev_sysretval _sys.E_return.__retval /* syscall retval */ > -#define dtev_syserror _sys.E_return.__error /* syscall error */ > + } _args; > +#define dtev_args _args.E_entry /* function args. */ > +#define dtev_retval _args.E_return.__retval /* function retval */ > +#define dtev_error _args.E_return.__error /* function error */ As well as those two chunks. > > }; > > @@ -132,6 +132,7 @@ struct dtioc_stat { > > #define DTIOCRECORD _IOW('D', 3, int) > #define DTIOCPRBENABLE _IOW('D', 4, struct dtioc_req) > +#define DTIOCPRBDISABLE _IOW('D', 5, struct dtioc_req) > > > #ifdef _KERNEL > @@ -213,12 +214,16 @@ struct dt_probe { > const char *dtp_func; /* [I] probe function */ > const char *dtp_name; /* [I] probe name */ > uint32_t dtp_pbn; /* [I] unique ID */ > - volatile uint32_t dtp_recording; /* [D] is it recording? */ > - uint8_t dtp_nargs; /* [I] # of arguments */ > + volatile uint32_t dtp_recording; /* [d] is it recording? */ > + > + struct mutex dtp_mtx; > + unsigned dtp_ref; /* [m] number of pcb referencing this probe*/ > > /* Provider specific fields. */ > - int dtp_sysnum; /* [I] related # of syscall */ > - const char *dtp_argtype[5];/* [I] type of arguments */ > + int dtp_sysnum; /* [I] related # of syscall */ > + const char *dtp_argtype[5];/* [I] type of arguments */ > + int dtp_nargs; /* [I] # of arguments */ > + vaddr_t dtp_addr; /* [I] address of breakpint */ > }; > > > @@ -231,13 +236,18 @@ struct dt_provider { > > int (*dtpv_alloc)(struct dt_probe *, struct dt_softc *, > struct dt_pcb_list *, struct dtioc_req *); > - void (*dtpv_enter)(struct dt_provider *, ...); > + int (*dtpv_enter)(struct dt_provider *, ...); > void (*dtpv_leave)(struct dt_provider *, ...); > + int (*dtpv_desalloc)(struct dt_probe *, struct dt_softc *, > + struct dtioc_req *); > }; > > +extern struct dt_provider dt_prov_kprobe; > + > int dt_prov_profile_init(void); > int dt_prov_syscall_init(void); > int dt_prov_static_init(void); > +int dt_prov_kprobe_init(void); > > struct dt_probe *dt_dev_alloc_probe(const char *, const char *, > struct dt_provider *); > diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c > index 1de0726ab95..3f8d59d7832 100644 > --- a/sys/kern/subr_prof.c > +++ b/sys/kern/subr_prof.c > @@ -70,10 +70,6 @@ prof_init(void) > char *cp; > int size; > > -#if !defined(GPROF) && defined(DDBPROF) > - db_prof_init(); > -#endif > - > /* > * Round lowpc and highpc to multiples of the density we're using > * so the rest of the scaling (here and in gprof) stays in ints. > diff --git a/usr.sbin/btrace/btrace.c b/usr.sbin/btrace/btrace.c > index 7b77ceb1366..fabbb4f78c0 100644 > --- a/usr.sbin/btrace/btrace.c > +++ b/usr.sbin/btrace/btrace.c > @@ -498,19 +498,24 @@ rules_teardown(int fd) > struct bt_rule *r, *rend = NULL; > int dokstack = 0, off = 0; > > + > if (g_nprobes > 0) { > if (ioctl(fd, DTIOCRECORD, &off)) > err(1, "DTIOCRECORD"); > } > > + > TAILQ_FOREACH(r, &g_rules, br_next) { > + dtrq = r->br_cookie; > if (r->br_type != B_RT_PROBE) { > if (r->br_type == B_RT_END) > rend = r; > continue; > - } > + } else { > + if (ioctl(fd, DTIOCPRBDISABLE, dtrq)) > + err(1, "DTIOCPRBDISABLE"); > + } > > - dtrq = r->br_cookie; > if (dtrq->dtrq_evtflags & DTEVT_KSTACK) > dokstack = 1; > } > @@ -657,8 +662,7 @@ builtin_arg(struct dt_evt *dtev, enum bt_argtype dat) > static char buf[sizeof("18446744073709551615")]; /* UINT64_MAX */ > > snprintf(buf, sizeof(buf) - 1, "%lu", > - dtev->dtev_sysargs[dat - B_AT_BI_ARG0]); > - > + dtev->dtev_args[dat - B_AT_BI_ARG0]); > return buf; > } > > @@ -1088,7 +1092,7 @@ ba2str(struct bt_arg *ba, struct dt_evt *dtev) > str = builtin_arg(dtev, ba->ba_type); > break; > case B_AT_BI_RETVAL: > - snprintf(buf, sizeof(buf) - 1, "%ld", (long)dtev->dtev_sysretval[0]); > + snprintf(buf, sizeof(buf) - 1, "%ld", (long)dtev->dtev_retval[0]); > str = buf; > break; > case B_AT_MAP: