I'd like to import this new MI pseudo-driver and the framework it provides to instrument and inspect kernel internals.
It is still under development and all the code is guarded under NDT, so it shouldn't impact GENERIC. However, at this stage of development I'm interested in getting code reviews and feedback. The design is fairly simple: events, in the form of descriptors on a ring, are produced in any kernel context and consumed by a userland process reading /dev/dt. struct dt_evt { unsigned int dtev_pbn; /* Probe number */ unsigned int dtev_cpu; /* CPU id */ pid_t dtev_pid; /* ID of current process */ pid_t dtev_tid; /* ID of current thread */ struct timespec dtev_tsp; /* timestamp (nsecs) */ struct dt_stack_trace dtev_kstack; /* kernel stack frame */ ... }; To decide when an event needs to be recorded, tracepoints, also known as probes, are placed in the code and enabled on demand. For the moment the framework allows defining static tracepoints via the TRACEPOINT() macro and also exposes per-syscall and hardclock probes, which allow generating flamegraphs. I'd appreciate particular review of the following items: * Event producer/consumer code which currently needs a mutex. The current implementation doesn't always use a PCB per-CPU. Moving to a lockless implementation would be beneficial for recording in deeper places of the kernel. * Barriers for enabling/disabling probes. dt(4) can be opened multiple times, allowing multiple processes to gather events in parallel. I'd like to be sure a close(2) doesn't stop another thread from capturing events. * Read wakeup. Currently a mutex is used to avoid losing a wakeup event in dtread(); however, this mutex creates lock ordering problems with the SCHED_LOCK() when probes are executed to profile the scheduler. I included changes to MAKEDEV on all archs where the dev number 30 is still free. I'll pick the closest number for other archs if that's ok. The man page doesn't include a list of ioctl(2) as I believe they will change soon. I'll send another mail for the userland interface. Ok to continue in-tree? 
Index: etc/MAKEDEV.common =================================================================== RCS file: /cvs/src/etc/MAKEDEV.common,v retrieving revision 1.107 diff -u -p -r1.107 MAKEDEV.common --- etc/MAKEDEV.common 17 Dec 2019 13:15:17 -0000 1.107 +++ etc/MAKEDEV.common 16 Jan 2020 18:31:56 -0000 @@ -167,6 +167,7 @@ target(all, vmm)dnl target(all, pvbus, 0, 1)dnl target(all, bpf)dnl target(all, kcov)dnl +target(all, dt)dnl dnl _mkdev(all, {-all-}, {-dnl show_target(all)dnl @@ -528,3 +529,5 @@ _mkdev(pvbus, {-pvbus*-}, {-M pvbus$U c _mkdev(local, local, {-test -s $T.local && sh $T.local-})dnl __devitem(kcov, kcov, Kernel code coverage tracing)dnl _mkdev(kcov, kcov, {-M kcov c major_kcov_c 0 600-})dnl +__devitem(dt, dt, Dynamic Tracer)dnl +_mkdev(dt, dt, {-M dt c major_dt_c 0 600-})dnl Index: etc/etc.amd64/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.amd64/MAKEDEV.md,v retrieving revision 1.72 diff -u -p -r1.72 MAKEDEV.md --- etc/etc.amd64/MAKEDEV.md 17 Dec 2019 13:08:54 -0000 1.72 +++ etc/etc.amd64/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -68,6 +68,7 @@ _DEV(au, 42) _DEV(bio, 79) _DEV(bktr, 49) _DEV(bpf, 23) +_DEV(dt, 30) _DEV(diskmap, 90) _DEV(drm, 87) _DEV(fdesc, 22) Index: etc/etc.arm64/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.arm64/MAKEDEV.md,v retrieving revision 1.5 diff -u -p -r1.5 MAKEDEV.md --- etc/etc.arm64/MAKEDEV.md 22 Dec 2019 18:18:02 -0000 1.5 +++ etc/etc.arm64/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -60,6 +60,7 @@ _DEV(au, 42) _DEV(bio, 79) _DEV(bktr, 49) _DEV(bpf, 23) +_DEV(dt, 30) _DEV(diskmap, 90) _DEV(drm, 87) _DEV(fdesc, 22) Index: etc/etc.armv7/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.armv7/MAKEDEV.md,v retrieving revision 1.15 diff -u -p -r1.15 MAKEDEV.md --- etc/etc.armv7/MAKEDEV.md 17 Dec 2019 13:08:55 -0000 1.15 +++ etc/etc.armv7/MAKEDEV.md 16 Jan 
2020 18:31:56 -0000 @@ -69,6 +69,7 @@ _DEV(au, 36) _DEV(bio, 52) _DEV(bktr, 75) _DEV(bpf, 22) +_DEV(dt, 30) _DEV(diskmap, 102) _DEV(fdesc, 7) _DEV(fuse, 77) Index: etc/etc.i386/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.i386/MAKEDEV.md,v retrieving revision 1.87 diff -u -p -r1.87 MAKEDEV.md --- etc/etc.i386/MAKEDEV.md 17 Dec 2019 13:08:55 -0000 1.87 +++ etc/etc.i386/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -70,6 +70,7 @@ _DEV(au, 42) _DEV(bio, 79) _DEV(bktr, 49) _DEV(bpf, 23) +_DEV(dt, 30) _DEV(diskmap, 91) _DEV(drm, 88) _DEV(fdesc, 22) Index: etc/etc.landisk/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.landisk/MAKEDEV.md,v retrieving revision 1.44 diff -u -p -r1.44 MAKEDEV.md --- etc/etc.landisk/MAKEDEV.md 17 Dec 2019 13:08:55 -0000 1.44 +++ etc/etc.landisk/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -73,6 +73,7 @@ _DEV(au, 36) _DEV(bio, 37) dnl _DEV(bktr, 75) _DEV(bpf, 22) +_DEV(dt, 30) _DEV(diskmap,101) _DEV(fdesc, 7) _DEV(fuse, 103) Index: etc/etc.loongson/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.loongson/MAKEDEV.md,v retrieving revision 1.28 diff -u -p -r1.28 MAKEDEV.md --- etc/etc.loongson/MAKEDEV.md 17 Dec 2019 13:08:56 -0000 1.28 +++ etc/etc.loongson/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -67,6 +67,7 @@ _DEV(apm, 14) _DEV(au, 44) _DEV(bio, 49) _DEV(bpf, 12) +_DEV(dt, 30) _DEV(diskmap, 70) _DEV(drm, 87) _DEV(fdesc, 7) Index: etc/etc.luna88k/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.luna88k/MAKEDEV.md,v retrieving revision 1.32 diff -u -p -r1.32 MAKEDEV.md --- etc/etc.luna88k/MAKEDEV.md 31 Dec 2016 00:50:01 -0000 1.32 +++ etc/etc.luna88k/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -101,6 +101,7 @@ _TITLE(spec) _DEV(au, 26) _DEV(bio, 49) _DEV(bpf, 22) +_DEV(dt, 30) _DEV(diskmap, 54) _DEV(fdesc, 21) _DEV(fuse, 45) 
Index: etc/etc.macppc/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.macppc/MAKEDEV.md,v retrieving revision 1.71 diff -u -p -r1.71 MAKEDEV.md --- etc/etc.macppc/MAKEDEV.md 17 Dec 2019 13:08:56 -0000 1.71 +++ etc/etc.macppc/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -78,6 +78,7 @@ _DEV(au, 44) _DEV(bio, 80) _DEV(bktr, 75) _DEV(bpf, 22) +_DEV(dt, 30) _DEV(diskmap, 84) _DEV(drm, 87) _DEV(fdesc, 21) Index: etc/etc.octeon/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.octeon/MAKEDEV.md,v retrieving revision 1.15 diff -u -p -r1.15 MAKEDEV.md --- etc/etc.octeon/MAKEDEV.md 17 Dec 2019 13:08:56 -0000 1.15 +++ etc/etc.octeon/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -71,6 +71,7 @@ _TITLE(spec) _DEV(au, 44) _DEV(bio, 49) _DEV(bpf, 12) +_DEV(dt, 30) _DEV(diskmap, 70) _DEV(fdesc, 7) _DEV(fuse, 53) Index: etc/etc.sgi/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.sgi/MAKEDEV.md,v retrieving revision 1.50 diff -u -p -r1.50 MAKEDEV.md --- etc/etc.sgi/MAKEDEV.md 17 Dec 2019 13:08:56 -0000 1.50 +++ etc/etc.sgi/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -75,6 +75,7 @@ _TITLE(spec) _DEV(au, 44) _DEV(bio, 49) _DEV(bpf, 12) +_DEV(dt, 30) _DEV(diskmap, 69) _DEV(fdesc, 7) _DEV(fuse, 73) Index: etc/etc.sparc64/MAKEDEV.md =================================================================== RCS file: /cvs/src/etc/etc.sparc64/MAKEDEV.md,v retrieving revision 1.89 diff -u -p -r1.89 MAKEDEV.md --- etc/etc.sparc64/MAKEDEV.md 17 Dec 2019 13:08:56 -0000 1.89 +++ etc/etc.sparc64/MAKEDEV.md 16 Jan 2020 18:31:56 -0000 @@ -113,6 +113,7 @@ _DEV(bpf, 105) _DEV(diskmap, 130) _DEV(drm, 87) _DEV(fdesc, 24) +_DEV(dt, 30) _DEV(fuse, 134) _DEV(hotplug, 124) _DEV(oppr) Index: share/man/man4/dt.4 =================================================================== RCS file: share/man/man4/dt.4 diff -N share/man/man4/dt.4 --- /dev/null 1 Jan 
1970 00:00:00 -0000 +++ share/man/man4/dt.4 16 Jan 2020 18:34:24 -0000 @@ -0,0 +1,47 @@ +.\" $OpenBSD$ +.\" +.\" Copyright (c) 2019 Martin Pieuchot <m...@openbsd.org> +.\" +.\" Permission to use, copy, modify, and distribute this software for any +.\" purpose with or without fee is hereby granted, provided that the above +.\" copyright notice and this permission notice appear in all copies. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +.\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +.\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +.\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +.\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +.\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF +.\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +.\" +.Dd $Mdocdate: November 14 2019 $ +.Dt DT 4 +.Os +.Sh NAME +.Nm dt +.Nd dynamic tracer +.Sh SYNOPSIS +.Cd "pseudo-device dt" +.Sh DESCRIPTION +System and application tracing can happen in the kernel. +It has to be configured and enabled through the +.Xr ioctl 2 +interface exposed by the pseudo-device +.Pa /dev/dt . +.\"Sh IOCTL INTERFACE +.\" +.Sh FILES +.Bl -tag -width /dev/dt -compact +.It Pa /dev/dt +dynamic tracing device. +.El +.Sh SEE ALSO +.Xr ioctl 2 , +.Xr bt 5 +and +.Xr btrace 8 . +.Sh HISTORY +The +.Nm +dynamic tracing mechanism first appeared in +.Ox 6.7 . 
Index: sys/arch/amd64/amd64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/amd64/amd64/conf.c,v retrieving revision 1.65 diff -u -p -r1.65 conf.c --- sys/arch/amd64/amd64/conf.c 17 Dec 2019 13:08:54 -0000 1.65 +++ sys/arch/amd64/amd64/conf.c 16 Jan 2020 18:31:56 -0000 @@ -176,6 +176,7 @@ cdev_decl(viocon); cdev_decl(pci); #endif +#include "dt.h" #include "pf.h" #include "hotplug.h" #include "gpio.h" @@ -223,7 +224,7 @@ struct cdevsw cdevsw[] = cdev_spkr_init(NSPKR,spkr), /* 27: PC speaker */ cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30 */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31 */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ Index: sys/arch/arm/arm/conf.c =================================================================== RCS file: /cvs/src/sys/arch/arm/arm/conf.c,v retrieving revision 1.52 diff -u -p -r1.52 conf.c --- sys/arch/arm/arm/conf.c 18 Dec 2019 06:53:46 -0000 1.52 +++ sys/arch/arm/arm/conf.c 16 Jan 2020 18:31:56 -0000 @@ -68,6 +68,7 @@ * Standard pseudo-devices */ #include "bpfilter.h" +#include "dt.h" #include "pf.h" #include "pty.h" #include "tun.h" @@ -299,7 +300,7 @@ struct cdevsw cdevsw[] = { cdev_ch_init(NCH,ch), /* 27: SCSI autochanger */ cdev_uk_init(NUK,uk), /* 28: SCSI unknown */ cdev_notdef(), /* 29: */ - cdev_notdef(), /* 30: */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31: */ cdev_notdef(), /* 32: */ cdev_tun_init(NTUN,tun), /* 33: network tunnel */ Index: sys/arch/arm64/arm64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/arm64/arm64/conf.c,v retrieving revision 1.10 diff -u -p -r1.10 conf.c --- sys/arch/arm64/arm64/conf.c 22 Dec 2019 18:18:02 -0000 1.10 +++ sys/arch/arm64/arm64/conf.c 16 Jan 2020 18:31:56 -0000 @@ -139,6 +139,7 @@ cdev_decl(drm); cdev_decl(pci); #endif +#include "dt.h" #include "pf.h" #include "hotplug.h" #include "vscsi.h" 
@@ -182,7 +183,7 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 27 */ cdev_notdef(), /* 28 was LKM */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30 */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31 */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ Index: sys/arch/i386/i386/conf.c =================================================================== RCS file: /cvs/src/sys/arch/i386/i386/conf.c,v retrieving revision 1.164 diff -u -p -r1.164 conf.c --- sys/arch/i386/i386/conf.c 17 Dec 2019 13:08:55 -0000 1.164 +++ sys/arch/i386/i386/conf.c 16 Jan 2020 18:31:56 -0000 @@ -168,6 +168,7 @@ cdev_decl(drm); cdev_decl(pci); #endif +#include "dt.h" #include "pf.h" #include "hotplug.h" #include "gpio.h" @@ -216,7 +217,7 @@ struct cdevsw cdevsw[] = cdev_spkr_init(NSPKR,spkr), /* 27: PC speaker */ cdev_notdef(), /* 28: was LKM */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30 */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31 */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ Index: sys/arch/landisk/landisk/conf.c =================================================================== RCS file: /cvs/src/sys/arch/landisk/landisk/conf.c,v retrieving revision 1.39 diff -u -p -r1.39 conf.c --- sys/arch/landisk/landisk/conf.c 17 Dec 2019 13:08:55 -0000 1.39 +++ sys/arch/landisk/landisk/conf.c 16 Jan 2020 18:31:56 -0000 @@ -67,6 +67,7 @@ * Standard pseudo-devices */ #include "bpfilter.h" +#include "dt.h" #include "pf.h" #include "bio.h" #include "pty.h" @@ -273,7 +274,7 @@ struct cdevsw cdevsw[] = { cdev_ch_init(NCH,ch), /* 27: SCSI autochanger */ cdev_uk_init(NUK,uk), /* 28: SCSI unknown */ cdev_notdef(), /* 29: */ - cdev_notdef(), /* 30: */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31: */ cdev_notdef(), /* 32: */ cdev_tun_init(NTUN,tun), /* 33: network tunnel */ Index: sys/arch/loongson/loongson/conf.c =================================================================== RCS file: 
/cvs/src/sys/arch/loongson/loongson/conf.c,v retrieving revision 1.26 diff -u -p -r1.26 conf.c --- sys/arch/loongson/loongson/conf.c 20 Dec 2019 13:25:02 -0000 1.26 +++ sys/arch/loongson/loongson/conf.c 16 Jan 2020 18:31:56 -0000 @@ -116,6 +116,7 @@ cdev_decl(wd); #include "pci.h" cdev_decl(pci); +#include "dt.h" #include "pf.h" #include "usb.h" @@ -166,7 +167,7 @@ struct cdevsw cdevsw[] = #else cdev_notdef(), /* 29 */ #endif - cdev_notdef(), /* 30: */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_pf_init(NPF,pf), /* 31: packet filter */ cdev_uk_init(NUK,uk), /* 32: unknown SCSI */ cdev_random_init(1,random), /* 33: random data source */ Index: sys/arch/luna88k/luna88k/conf.c =================================================================== RCS file: /cvs/src/sys/arch/luna88k/luna88k/conf.c,v retrieving revision 1.31 diff -u -p -r1.31 conf.c --- sys/arch/luna88k/luna88k/conf.c 17 Dec 2016 05:22:34 -0000 1.31 +++ sys/arch/luna88k/luna88k/conf.c 16 Jan 2020 18:31:56 -0000 @@ -67,6 +67,7 @@ #include "wsmouse.h" #include "wsmux.h" +#include "dt.h" #include "pf.h" #include "vscsi.h" #include "pppx.h" @@ -131,7 +132,7 @@ struct cdevsw cdevsw[] = cdev_tty_init(NCOM, com), /* 27: serial port (on PCMCIA) */ cdev_disk_init(NWD,wd), /* 28: IDE disk (on PCMCIA) */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30 */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31 */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ Index: sys/arch/macppc/macppc/conf.c =================================================================== RCS file: /cvs/src/sys/arch/macppc/macppc/conf.c,v retrieving revision 1.69 diff -u -p -r1.69 conf.c --- sys/arch/macppc/macppc/conf.c 17 Dec 2019 13:08:56 -0000 1.69 +++ sys/arch/macppc/macppc/conf.c 16 Jan 2020 18:31:56 -0000 @@ -116,6 +116,7 @@ cdev_decl(pci); #include "video.h" #include "midi.h" +#include "dt.h" #include "pf.h" #include "radio.h" @@ -159,7 +160,7 @@ struct cdevsw cdevsw[] = { cdev_notdef(), /* 27 */ cdev_notdef(), 
/* 28 */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30 */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31 */ cdev_notdef(), /* 32 */ cdev_notdef(), /* 33 */ Index: sys/arch/octeon/octeon/conf.c =================================================================== RCS file: /cvs/src/sys/arch/octeon/octeon/conf.c,v retrieving revision 1.22 diff -u -p -r1.22 conf.c --- sys/arch/octeon/octeon/conf.c 17 Dec 2019 13:08:56 -0000 1.22 +++ sys/arch/octeon/octeon/conf.c 16 Jan 2020 18:31:56 -0000 @@ -129,6 +129,7 @@ cdev_decl(amdcf); #include "pci.h" cdev_decl(pci); +#include "dt.h" #include "pf.h" #include "usb.h" @@ -185,7 +186,7 @@ struct cdevsw cdevsw[] = #else cdev_notdef(), /* 29 */ #endif - cdev_notdef(), /* 30: */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_pf_init(NPF,pf), /* 31: packet filter */ cdev_uk_init(NUK,uk), /* 32: unknown SCSI */ cdev_random_init(1,random), /* 33: random data source */ Index: sys/arch/sgi/sgi/conf.c =================================================================== RCS file: /cvs/src/sys/arch/sgi/sgi/conf.c,v retrieving revision 1.40 diff -u -p -r1.40 conf.c --- sys/arch/sgi/sgi/conf.c 17 Dec 2019 13:08:56 -0000 1.40 +++ sys/arch/sgi/sgi/conf.c 16 Jan 2020 18:31:56 -0000 @@ -115,6 +115,7 @@ cdev_decl(wd); #include "pci.h" cdev_decl(pci); +#include "dt.h" #include "pf.h" #include "usb.h" @@ -165,7 +166,7 @@ struct cdevsw cdevsw[] = #else cdev_notdef(), /* 29 */ #endif - cdev_notdef(), /* 30: */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_pf_init(NPF,pf), /* 31: packet filter */ cdev_uk_init(NUK,uk), /* 32: unknown SCSI */ cdev_random_init(1,random), /* 33: random data source */ Index: sys/arch/sparc64/sparc64/conf.c =================================================================== RCS file: /cvs/src/sys/arch/sparc64/sparc64/conf.c,v retrieving revision 1.81 diff -u -p -r1.81 conf.c --- sys/arch/sparc64/sparc64/conf.c 17 Dec 2019 13:08:56 -0000 1.81 +++ sys/arch/sparc64/sparc64/conf.c 16 Jan 2020 
18:31:56 -0000 @@ -108,6 +108,7 @@ cdev_decl(pci); #include "ulpt.h" #include "ucom.h" +#include "dt.h" #include "pf.h" #include "ksyms.h" @@ -181,7 +182,7 @@ struct cdevsw cdevsw[] = cdev_notdef(), /* 27 */ cdev_notdef(), /* 28: Systech VPC-2200 versatec/centronics */ cdev_notdef(), /* 29 */ - cdev_notdef(), /* 30: Xylogics tape */ + cdev_dt_init(NDT,dt), /* 30: dynamic tracer */ cdev_notdef(), /* 31: /dev/cgtwo */ cdev_notdef(), /* 32: should be /dev/gpone */ cdev_notdef(), /* 33 */ Index: sys/conf/GENERIC =================================================================== RCS file: /cvs/src/sys/conf/GENERIC,v retrieving revision 1.266 diff -u -p -r1.266 GENERIC --- sys/conf/GENERIC 12 Oct 2019 17:06:02 -0000 1.266 +++ sys/conf/GENERIC 16 Jan 2020 18:31:56 -0000 @@ -82,6 +82,7 @@ pseudo-device msts 1 # MSTS line discipl pseudo-device endrun 1 # EndRun line discipline pseudo-device vnd 4 # vnode disk devices pseudo-device ksyms 1 # kernel symbols device +#pseudo-device dt # Dynamic Tracer # clonable devices pseudo-device bpfilter # packet filter Index: sys/conf/files =================================================================== RCS file: /cvs/src/sys/conf/files,v retrieving revision 1.680 diff -u -p -r1.680 files --- sys/conf/files 11 Jan 2020 00:56:38 -0000 1.680 +++ sys/conf/files 16 Jan 2020 18:31:56 -0000 @@ -603,6 +603,12 @@ file net/if_pppoe.c pppoe needs-flag pseudo-device kcov file dev/kcov.c kcov needs-flag +pseudo-device dt +file dev/dt/dt_dev.c dt needs-flag +file dev/dt/dt_prov_profile.c dt +file dev/dt/dt_prov_syscall.c dt +file dev/dt/dt_prov_static.c dt + # XXX machine-independent SCSI files should live somewhere here, maybe # kernel sources Index: sys/ddb/db_access.c =================================================================== RCS file: /cvs/src/sys/ddb/db_access.c,v retrieving revision 1.16 diff -u -p -r1.16 db_access.c --- sys/ddb/db_access.c 7 Nov 2019 13:16:25 -0000 1.16 +++ sys/ddb/db_access.c 16 Jan 2020 18:31:56 -0000 @@ -33,19 
+33,17 @@ #include <sys/param.h> #include <sys/endian.h> -#include <machine/db_machdep.h> /* type definitions */ - #include <ddb/db_access.h> /* * Access unaligned data items on aligned (longword) * boundaries. */ -db_expr_t +long db_get_value(vaddr_t addr, size_t size, int is_signed) { - char data[sizeof(db_expr_t)]; - db_expr_t value, extend; + char data[sizeof(long)]; + long value, extend; int i; #ifdef DIAGNOSTIC @@ -56,7 +54,7 @@ db_get_value(vaddr_t addr, size_t size, db_read_bytes(addr, size, data); value = 0; - extend = (~(db_expr_t)0) << (size * 8 - 1); + extend = (~(long)0) << (size * 8 - 1); #if BYTE_ORDER == LITTLE_ENDIAN for (i = size - 1; i >= 0; i--) #else /* BYTE_ORDER == BIG_ENDIAN */ @@ -64,15 +62,15 @@ db_get_value(vaddr_t addr, size_t size, #endif /* BYTE_ORDER */ value = (value << 8) + (data[i] & 0xFF); - if (size < sizeof(db_expr_t) && is_signed && (value & extend)) + if (size < sizeof(long) && is_signed && (value & extend)) value |= extend; return (value); } void -db_put_value(vaddr_t addr, size_t size, db_expr_t value) +db_put_value(vaddr_t addr, size_t size, long value) { - char data[sizeof(db_expr_t)]; + char data[sizeof(long)]; int i; #ifdef DIAGNOSTIC Index: sys/ddb/db_access.h =================================================================== RCS file: /cvs/src/sys/ddb/db_access.h,v retrieving revision 1.10 diff -u -p -r1.10 db_access.h --- sys/ddb/db_access.h 7 Nov 2019 13:16:25 -0000 1.10 +++ sys/ddb/db_access.h 16 Jan 2020 18:31:56 -0000 @@ -33,8 +33,8 @@ /* * Data access functions for debugger. 
*/ -db_expr_t db_get_value(vaddr_t, size_t, int); -void db_put_value(vaddr_t, size_t, db_expr_t); +long db_get_value(vaddr_t, size_t, int); +void db_put_value(vaddr_t, size_t, long); void db_read_bytes(vaddr_t, size_t, char *); void db_write_bytes(vaddr_t, size_t, char *); Index: sys/dev/dt/dt_dev.c =================================================================== RCS file: sys/dev/dt/dt_dev.c diff -N sys/dev/dt/dt_dev.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/dt/dt_dev.c 16 Jan 2020 18:31:56 -0000 @@ -0,0 +1,722 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2019 Martin Pieuchot <m...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/device.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <dev/dt/dtvar.h> + +#ifdef DDB +#include <ddb/db_access.h> +/* + * How many frames are used by the profiling code? For example + * on amd64: + * + * From syscall provider: + * + * dt_prov_syscall_entry+0x141 + * syscall+0x205 <--- start here + * Xsyscall+0x128 + * + * From profile provider: + * + * dt_prov_profile_enter+0x6e + * hardclock+0x12c + * clockintr+0x59 + * intr_handler+0x6e + * Xresume_legacy0+0x1d3 + * cpu_idle_cycle+0x1b <---- start here. 
+ * proc_trampoline+0x1c + */ +#if notyet +#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(4) +#else +#define DT_HOOK_FRAME_ADDRESS __builtin_frame_address(0) +#endif +#endif /* DDB */ + +#define DT_EVTRING_SIZE 16 /* # of slots in per PCB event ring */ + +#define DPRINTF(x...) /* nothing */ + +/* + * Descriptor associated with each program opening /dev/dt. It is used + * to keep track of enabled PCBs. + * + * Locks used to protect struct members in this file: + * m per-softc mutex + * k kernel lock + */ +struct dt_softc { + SLIST_ENTRY(dt_softc) ds_next; /* [k] descriptor list */ + int ds_unit; /* [I] D_CLONE unique unit */ + pid_t ds_pid; /* [I] PID of tracing program */ + + struct mutex ds_mtx; + + struct dt_pcb_list ds_pcbs; /* [k] list of enabled PCBs */ + struct dt_evt *ds_bufqueue; /* [k] copy evts to userland */ + size_t ds_bufqlen; /* [k] length of the queue */ + int ds_recording; /* [k] currently recording? */ + int ds_evtcnt; /* [m] # of readable evts */ + + /* Counters */ + uint64_t ds_readevt; /* [m] # of events read */ + uint64_t ds_dropevt; /* [m] # of events dropped */ +}; + +SLIST_HEAD(, dt_softc) dtdev_list; /* [k] list of open /dev/dt nodes */ + +/* + * Probes are created during dt_attach() and never modified/freed during + * the lifetime of the system. That's why we consider them as [I]mmutable. 
+ */ +unsigned int dt_nprobes; /* [I] # of probes available */ +SIMPLEQ_HEAD(, dt_probe) dt_probe_list; /* [I] list of probes */ + +struct rwlock dt_lock = RWLOCK_INITIALIZER("dtlk"); +volatile uint32_t dt_tracing = 0; /* [d] # of processes tracing */ + +void dtattach(struct device *, struct device *, void *); +int dtopen(dev_t, int, int, struct proc *); +int dtclose(dev_t, int, int, struct proc *); +int dtread(dev_t, struct uio *, int); +int dtioctl(dev_t, u_long, caddr_t, int, struct proc *); + +struct dt_softc *dtlookup(int); + +int dt_ioctl_list_probes(struct dt_softc *, struct dtioc_probe *); +int dt_ioctl_get_stats(struct dt_softc *, struct dtioc_stat *); +int dt_ioctl_record_start(struct dt_softc *); +void dt_ioctl_record_stop(struct dt_softc *); +int dt_ioctl_probe_enable(struct dt_softc *, struct dtioc_req *); +void dt_ioctl_probe_disable(struct dt_softc *, struct dtioc_req *); + +int dt_enter(void); +void dt_leave(uint32_t); + +int dt_pcb_ring_copy(struct dt_pcb *, struct dt_evt *, size_t, uint64_t *); + +void +dtattach(struct device *parent, struct device *self, void *aux) +{ + SLIST_INIT(&dtdev_list); + SIMPLEQ_INIT(&dt_probe_list); + + /* Init providers */ + dt_nprobes += dt_prov_profile_init(); + dt_nprobes += dt_prov_syscall_init(); + dt_nprobes += dt_prov_static_init(); + + printf("dt: %u probes\n", dt_nprobes); +} + +int +dtopen(dev_t dev, int flags, int mode, struct proc *p) +{ + struct dt_softc *sc; + int unit = minor(dev); + + KASSERT(dtlookup(unit) == NULL); + + sc = malloc(sizeof(*sc), M_DEVBUF, M_WAITOK|M_CANFAIL|M_ZERO); + if (sc == NULL) + return ENOMEM; + + /* + * Enough space to empty 2 full rings of events in a single read. 
+ */ + sc->ds_bufqlen = 2 * DT_EVTRING_SIZE; + sc->ds_bufqueue = mallocarray(sc->ds_bufqlen, sizeof(*sc->ds_bufqueue), + M_DEVBUF, M_WAITOK|M_CANFAIL); + if (sc->ds_bufqueue == NULL) + goto bad; + + sc->ds_unit = unit; + sc->ds_pid = p->p_p->ps_pid; + TAILQ_INIT(&sc->ds_pcbs); + mtx_init(&sc->ds_mtx, IPL_HIGH); + sc->ds_evtcnt = 0; + sc->ds_readevt = 0; + sc->ds_dropevt = 0; + + SLIST_INSERT_HEAD(&dtdev_list, sc, ds_next); + + DPRINTF("dt%d: pid %d open\n", sc->ds_unit, sc->ds_pid); + + return 0; + +bad: + free(sc, M_DEVBUF, sizeof(*sc)); + return ENOMEM; +} + +int +dtclose(dev_t dev, int flags, int mode, struct proc *p) +{ + struct dt_softc *sc; + int unit = minor(dev); + + sc = dtlookup(unit); + KASSERT(sc != NULL); + + DPRINTF("dt%d: pid %d close\n", sc->ds_unit, sc->ds_pid); + + SLIST_REMOVE(&dtdev_list, sc, dt_softc, ds_next); + dt_ioctl_record_stop(sc); + dt_pcb_purge(&sc->ds_pcbs); + + free(sc->ds_bufqueue, M_DEVBUF, + sc->ds_bufqlen * sizeof(*sc->ds_bufqueue)); + free(sc, M_DEVBUF, sizeof(*sc)); + + return 0; +} + +int +dtread(dev_t dev, struct uio *uio, int flags) +{ + struct dt_softc *sc; + struct dt_evt *estq; + struct dt_pcb *dp; + int error, unit = minor(dev); + size_t qlen, count, read = 0; + uint64_t dropped = 0; + + sc = dtlookup(unit); + KASSERT(sc != NULL); + + count = howmany(uio->uio_resid, sizeof(struct dt_evt)); + if (count < 1) + return (EMSGSIZE); + + mtx_enter(&sc->ds_mtx); + while (!sc->ds_evtcnt) { + error = msleep(sc, &sc->ds_mtx, PWAIT|PCATCH, "dtread", 0); + if (error == EINTR || error == ERESTART) + break; + } + mtx_leave(&sc->ds_mtx); + + if (error) + return error; + + estq = sc->ds_bufqueue; + qlen = MIN(sc->ds_bufqlen, count); + + KERNEL_ASSERT_LOCKED(); + TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { + count = dt_pcb_ring_copy(dp, estq, qlen, &dropped); + read += count; + estq += count; /* pointer aritmetic */ + qlen -= count; + if (qlen == 0) + break; + } + if (read > 0) + uiomove(sc->ds_bufqueue, read * sizeof(struct dt_evt), 
uio); + + mtx_enter(&sc->ds_mtx); + sc->ds_evtcnt -= read; + sc->ds_readevt += read; + sc->ds_dropevt += dropped; + mtx_leave(&sc->ds_mtx); + + return 0; +} + +int +dtioctl(dev_t dev, u_long cmd, caddr_t addr, int flag, struct proc *p) +{ + struct dt_softc *sc; + int unit = minor(dev); + int on, error = 0; + + sc = dtlookup(unit); + KASSERT(sc != NULL); + + switch (cmd) { + case DTIOCGPLIST: + return dt_ioctl_list_probes(sc, (struct dtioc_probe *)addr); + case DTIOCGSTATS: + return dt_ioctl_get_stats(sc, (struct dtioc_stat *)addr); + case DTIOCRECORD: + case DTIOCPRBENABLE: + /* root only ioctl(2) */ + break; + default: + return ENOTTY; + } + + if ((error = suser(p)) != 0) + return error; + + switch (cmd) { + case DTIOCRECORD: + on = *(int *)addr; + if (on) + error = dt_ioctl_record_start(sc); + else + dt_ioctl_record_stop(sc); + break; + case DTIOCPRBENABLE: + error = dt_ioctl_probe_enable(sc, (struct dtioc_req *)addr); + break; + default: + KASSERT(0); + } + + return error; +} + +struct dt_softc * +dtlookup(int unit) +{ + struct dt_softc *sc; + + KERNEL_ASSERT_LOCKED(); + + SLIST_FOREACH(sc, &dtdev_list, ds_next) { + if (sc->ds_unit == unit) + break; + } + + return sc; +} + +int +dtioc_req_isvalid(struct dtioc_req *dtrq) +{ + switch (dtrq->dtrq_filter.dtf_operand) { + case DT_OP_NONE: + case DT_OP_EQ: + case DT_OP_NE: + break; + default: + return 0; + } + + switch (dtrq->dtrq_filter.dtf_variable) { + case DT_FV_NONE: + case DT_FV_PID: + case DT_FV_TID: + break; + default: + return 0; + } + + return 1; +} + +int +dt_ioctl_list_probes(struct dt_softc *sc, struct dtioc_probe *dtpr) +{ + struct dtioc_probe_info info, *dtpi; + struct dt_probe *dtp; + size_t size; + int error = 0; + + if (dtpr->dtpr_size == 0) { + dtpr->dtpr_size = dt_nprobes * sizeof(*dtpi); + return 0; + } + + size = dtpr->dtpr_size; + dtpi = dtpr->dtpr_probes; + memset(&info, 0, sizeof(info)); + SIMPLEQ_FOREACH(dtp, &dt_probe_list, dtp_next) { + if (size < sizeof(*dtpi)) { + error = ENOSPC; + break; 
+ } + info.dtpi_pbn = dtp->dtp_pbn; + strlcpy(info.dtpi_prov, dtp->dtp_prov->dtpv_name, + sizeof(info.dtpi_prov)); + strlcpy(info.dtpi_func, dtp->dtp_func, sizeof(info.dtpi_func)); + strlcpy(info.dtpi_name, dtp->dtp_name, sizeof(info.dtpi_name)); + error = copyout(&info, dtpi, sizeof(*dtpi)); + if (error) + break; + size -= sizeof(*dtpi); + dtpi++; + }; + + return error; +} + +int +dt_ioctl_get_stats(struct dt_softc *sc, struct dtioc_stat *dtst) +{ + mtx_enter(&sc->ds_mtx); + dtst->dtst_readevt = sc->ds_readevt; + dtst->dtst_dropevt = sc->ds_dropevt; + mtx_leave(&sc->ds_mtx); + + return 0; +} + +int +dt_ioctl_record_start(struct dt_softc *sc) +{ + struct dt_pcb *dp; + int count; + + if (sc->ds_recording) + return EBUSY; + + KERNEL_ASSERT_LOCKED(); + if (TAILQ_EMPTY(&sc->ds_pcbs)) + return ENOENT; + + count = dt_enter(); + TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { + struct dt_probe *dtp = dp->dp_dtp; + + rw_assert_wrlock(&dt_lock); + SMR_SLIST_INSERT_HEAD_LOCKED(&dtp->dtp_pcbs, dp, dp_pnext); + dtp->dtp_recording++; + dtp->dtp_prov->dtpv_recording++; + } + dt_leave(count); + + sc->ds_recording = 1; + dt_tracing++; + + return 0; +} + +void +dt_ioctl_record_stop(struct dt_softc *sc) +{ + struct dt_pcb *dp; + int count; + + KASSERT(suser(curproc) == 0); + + if (!sc->ds_recording) + return; + + DPRINTF("dt%d: pid %d disable\n", sc->ds_unit, sc->ds_pid); + + dt_tracing--; + sc->ds_recording = 0; + + count = dt_enter(); + TAILQ_FOREACH(dp, &sc->ds_pcbs, dp_snext) { + struct dt_probe *dtp = dp->dp_dtp; + + rw_assert_wrlock(&dt_lock); + dtp->dtp_recording--; + dtp->dtp_prov->dtpv_recording--; + SMR_SLIST_REMOVE_LOCKED(&dtp->dtp_pcbs, dp, dt_pcb, dp_pnext); + } + dt_leave(count); +} + +int +dt_ioctl_probe_enable(struct dt_softc *sc, struct dtioc_req *dtrq) +{ + struct dt_pcb_list plist; + struct dt_probe *dtp; + struct dt_pcb *dp; + int error; + + KASSERT(suser(curproc) == 0); + + if (!dtioc_req_isvalid(dtrq)) + return EINVAL; + + SIMPLEQ_FOREACH(dtp, &dt_probe_list, 
dtp_next) { + if (dtp->dtp_pbn == dtrq->dtrq_pbn) + break; + } + if (dtp == NULL) + return ENOENT; + + TAILQ_INIT(&plist); + error = dtp->dtp_prov->dtpv_alloc(dtp, sc, &plist, dtrq); + if (error) + return error; + + DPRINTF("dt%d: pid %d enable %u : %b\n", sc->ds_unit, sc->ds_pid, + dtrq->dtrq_pbn, (unsigned int)dtrq->dtrq_evtflags, DTEVT_FLAG_BITS); + + /* Append all PCBs to this instance */ + while ((dp = TAILQ_FIRST(&plist)) != NULL) { + TAILQ_REMOVE(&plist, dp, dp_snext); + TAILQ_INSERT_HEAD(&sc->ds_pcbs, dp, dp_snext); + } + + return 0; +} + +int +dt_enter(void) +{ + uint32_t count; + + rw_enter_write(&dt_lock); + count = dt_tracing; + dt_tracing = 0; + + smr_barrier(); + + return count; +} + +void +dt_leave(uint32_t count) +{ + dt_tracing = count; + rw_exit_write(&dt_lock); +} + +struct dt_probe * +dt_dev_alloc_probe(const char *func, const char *name, struct dt_provider *dtpv) +{ + struct dt_probe *dtp; + + dtp = malloc(sizeof(*dtp), M_DT, M_NOWAIT|M_ZERO); + if (dtp == NULL) + return NULL; + + SMR_SLIST_INIT(&dtp->dtp_pcbs); + dtp->dtp_prov = dtpv; + dtp->dtp_func = func; + dtp->dtp_name = name; + dtp->dtp_sysnum = -1; + + return dtp; +} + +void +dt_dev_register_probe(struct dt_probe *dtp) +{ + static uint64_t probe_nb; + + dtp->dtp_pbn = ++probe_nb; + SIMPLEQ_INSERT_TAIL(&dt_probe_list, dtp, dtp_next); +} + +struct dt_pcb * +dt_pcb_alloc(struct dt_probe *dtp, struct dt_softc *sc) +{ + struct dt_pcb *dp; + + dp = malloc(sizeof(*dp), M_DT, M_WAITOK|M_CANFAIL|M_ZERO); + if (dp == NULL) + goto bad; + + dp->dp_ring = mallocarray(DT_EVTRING_SIZE, sizeof(*dp->dp_ring), M_DT, + M_WAITOK|M_CANFAIL|M_ZERO); + if (dp->dp_ring == NULL) + goto bad; + + mtx_init(&dp->dp_mtx, IPL_HIGH); + dp->dp_sc = sc; + dp->dp_dtp = dtp; + return dp; +bad: + dt_pcb_free(dp); + return NULL; +} + +void +dt_pcb_free(struct dt_pcb *dp) +{ + if (dp == NULL) + return; + free(dp->dp_ring, M_DT, DT_EVTRING_SIZE * sizeof(*dp->dp_ring)); + free(dp, M_DT, sizeof(*dp)); +} + +void 
+dt_pcb_purge(struct dt_pcb_list *plist) +{ + struct dt_pcb *dp; + + while ((dp = TAILQ_FIRST(plist)) != NULL) { + TAILQ_REMOVE(plist, dp, dp_snext); + dt_pcb_free(dp); + } +} + +int +dt_pcb_filter(struct dt_pcb *dp) +{ + struct dt_filter *dtf = &dp->dp_filter; + struct proc *p = curproc; + unsigned int var; + int match = 1; + + /* Filter out tracing program. */ + if (dp->dp_sc->ds_pid == p->p_p->ps_pid) + return 1; + + switch (dtf->dtf_variable) { + case DT_FV_PID: + var = p->p_p->ps_pid; + break; + case DT_FV_TID: + var = p->p_tid; + break; + case DT_FV_NONE: + break; + default: + KASSERT(0); + } + + switch (dtf->dtf_operand) { + case DT_OP_EQ: + match = !!(var == dtf->dtf_value); + break; + case DT_OP_NE: + match = !!(var != dtf->dtf_value); + break; + case DT_OP_NONE: + break; + default: + KASSERT(0); + } + + return !match; +} + + +/* + * Get a reference to the next free event state from the ring. + */ +struct dt_evt * +dt_pcb_ring_get(struct dt_pcb *dp) +{ + struct proc *p = curproc; + struct dt_evt *dtev; + int distance; + + if (dt_pcb_filter(dp)) + return NULL; + + mtx_enter(&dp->dp_mtx); + distance = dp->dp_prod - dp->dp_cons; + if (distance == 1 || distance == (1 - DT_EVTRING_SIZE)) { + /* read(2) isn't finished */ + dp->dp_dropevt++; + mtx_leave(&dp->dp_mtx); + return NULL; + } + + /* + * Save states in next free event slot. 
/*
 * Copy at most `qlen' events from `dp', producing the same amount
 * of free slots.
 *
 * NOTE: the index names are inverted with regard to their role here:
 * `dp_cons' is the slot written by dt_pcb_ring_get() and advanced by
 * dt_pcb_ring_consume(), while `dp_prod' is the slot this function reads
 * from and advances.
 *
 * When at least one event is available, the drop counter of the PCB is
 * added to `*dropped' and reset.
 *
 * Returns the number of events copied to `estq'.
 */
int
dt_pcb_ring_copy(struct dt_pcb *dp, struct dt_evt *estq, size_t qlen,
    uint64_t *dropped)
{
	size_t count, copied = 0;
	unsigned int cons, prod;

	KASSERT(qlen > 0);

	mtx_enter(&dp->dp_mtx);
	cons = dp->dp_cons;
	prod = dp->dp_prod;

	/*
	 * First chunk: from the read index up to either the write index
	 * or, if the pending events wrap, the end of the ring.
	 */
	if (cons < prod)
		count = DT_EVTRING_SIZE - prod;
	else
		count = cons - prod;

	if (count == 0)
		goto out;

	*dropped += dp->dp_dropevt;
	dp->dp_dropevt = 0;

	count = MIN(count, qlen);

	memcpy(&estq[0], &dp->dp_ring[prod], count * sizeof(*estq));
	copied += count;

	/* Produce */
	prod = (prod + count) % DT_EVTRING_SIZE;

	/* If the queue is full or the ring didn't wrap, stop here. */
	if (qlen == copied || prod != 0 || cons == 0)
		goto out;

	/* Second chunk: events stored at the beginning of the ring. */
	count = MIN(cons, (qlen - copied));
	memcpy(&estq[copied], &dp->dp_ring[0], count * sizeof(*estq));
	copied += count;
	prod += count;

out:
	dp->dp_prod = prod;
	mtx_leave(&dp->dp_mtx);
	return copied;
}
+ */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/atomic.h> + +#include <dev/dt/dtvar.h> + +struct dt_probe *dtpp_profile; /* per-CPU profile probe */ +struct dt_probe *dtpp_interval; /* global periodic probe */ + +/* Flags that make sense for this provider */ +#define DTEVT_PROV_PROFILE DTEVT_KSTACK + +int dt_prov_profile_alloc(struct dt_probe *, struct dt_softc *, + struct dt_pcb_list *, struct dtioc_req *); +void dt_prov_profile_enter(struct dt_provider *, ...); +void dt_prov_interval_enter(struct dt_provider *, ...); + +struct dt_provider dt_prov_profile = { + .dtpv_name = "profile", + .dtpv_alloc = dt_prov_profile_alloc, + .dtpv_enter = dt_prov_profile_enter, + .dtpv_leave = NULL, +}; + +struct dt_provider dt_prov_interval = { + .dtpv_name = "interval", + .dtpv_alloc = dt_prov_profile_alloc, + .dtpv_enter = dt_prov_interval_enter, + .dtpv_leave = NULL, +}; + +int +dt_prov_profile_init(void) +{ + dtpp_profile = dt_dev_alloc_probe("hz", "97", &dt_prov_profile); + dt_dev_register_probe(dtpp_profile); + if (dtpp_profile == NULL) + return 0; + dtpp_interval = dt_dev_alloc_probe("hz", "1", &dt_prov_interval); + dt_dev_register_probe(dtpp_interval); + if (dtpp_interval == NULL) + return 1; + return 2; +} + +int +dt_prov_profile_alloc(struct dt_probe *dtp, struct dt_softc *sc, + struct dt_pcb_list *plist, struct dtioc_req *dtrq) +{ + struct dt_pcb *dp; + struct cpu_info *ci; + CPU_INFO_ITERATOR cii; + extern int hz; + + KASSERT(dtioc_req_isvalid(dtrq)); + KASSERT(TAILQ_EMPTY(plist)); + KASSERT(dtp == dtpp_profile || dtp == dtpp_interval); + + if (dtrq->dtrq_rate <= 0 || dtrq->dtrq_rate >= hz) + return EOPNOTSUPP; + + CPU_INFO_FOREACH(cii, ci) { + if (!CPU_IS_PRIMARY(ci) && (dtp == dtpp_interval)) + continue; + + dp = dt_pcb_alloc(dtp, sc); + if (dp == NULL) { + dt_pcb_purge(plist); + return ENOMEM; + } + + dp->dp_maxtick = dtrq->dtrq_rate; + dp->dp_cpuid = ci->ci_cpuid; + + dp->dp_filter = dtrq->dtrq_filter; + 
dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_PROFILE; + TAILQ_INSERT_HEAD(plist, dp, dp_snext); + } + + return 0; +} + +static inline void +dt_prov_profile_fire(struct dt_pcb *dp) +{ + struct dt_evt *dtev; + + if (++dp->dp_nticks < dp->dp_maxtick) + return; + + dtev = dt_pcb_ring_get(dp); + if (dtev == NULL) + return; + dt_pcb_ring_consume(dp, dtev); + dp->dp_nticks = 0; +} + +void +dt_prov_profile_enter(struct dt_provider *dtpv, ...) +{ + struct cpu_info *ci = curcpu(); + struct dt_pcb *dp; + + KASSERT(dtpv == &dt_prov_profile); + + smr_read_enter(); + SMR_SLIST_FOREACH(dp, &dtpp_profile->dtp_pcbs, dp_pnext) { + if (dp->dp_cpuid != ci->ci_cpuid) + continue; + + dt_prov_profile_fire(dp); + } + smr_read_leave(); +} + +void +dt_prov_interval_enter(struct dt_provider *dtpv, ...) +{ + struct dt_pcb *dp; + + KASSERT(dtpv == &dt_prov_interval); + + smr_read_enter(); + SMR_SLIST_FOREACH(dp, &dtpp_interval->dtp_pcbs, dp_pnext) { + dt_prov_profile_fire(dp); + } + smr_read_leave(); +} Index: sys/dev/dt/dt_prov_static.c =================================================================== RCS file: sys/dev/dt/dt_prov_static.c diff -N sys/dev/dt/dt_prov_static.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/dt/dt_prov_static.c 16 Jan 2020 18:31:56 -0000 @@ -0,0 +1,136 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2019 Martin Pieuchot <m...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/tracepoint.h> + +#include <dev/dt/dtvar.h> + +int dt_prov_static_alloc(struct dt_probe *, struct dt_softc *, + struct dt_pcb_list *, struct dtioc_req *); +void dt_prov_static_hook(struct dt_provider *, ...); + +struct dt_provider dt_prov_static = { + .dtpv_name = "tracepoint", + .dtpv_alloc = dt_prov_static_alloc, + .dtpv_enter = dt_prov_static_hook, +}; + +/* + * Scheduler provider + */ +DT_STATIC_PROBE2(sched, dequeue, "pid_t", "pid_t"); +DT_STATIC_PROBE2(sched, enqueue, "pid_t", "pid_t"); +DT_STATIC_PROBE2(sched, off__cpu, "pid_t", "pid_t"); +DT_STATIC_PROBE0(sched, on__cpu); +DT_STATIC_PROBE0(sched, remain__cpu); +DT_STATIC_PROBE0(sched, sleep); +DT_STATIC_PROBE0(sched, wakeup); + +/* + * Raw syscalls + */ +DT_STATIC_PROBE1(raw_syscalls, sys_enter, "register_t"); +DT_STATIC_PROBE1(raw_syscalls, sys_exit, "register_t"); + +/* + * List of all static probes + */ +struct dt_probe *dtps_static[] = { + /* Scheduler */ + &_DT_STATIC_P(sched, dequeue), + &_DT_STATIC_P(sched, enqueue), + &_DT_STATIC_P(sched, off__cpu), + &_DT_STATIC_P(sched, on__cpu), + &_DT_STATIC_P(sched, remain__cpu), + &_DT_STATIC_P(sched, sleep), + &_DT_STATIC_P(sched, wakeup), + /* Raw syscalls */ + &_DT_STATIC_P(raw_syscalls, sys_enter), + &_DT_STATIC_P(raw_syscalls, sys_exit), +}; + +int +dt_prov_static_init(void) +{ + int i; + + for (i = 0; i < nitems(dtps_static); i++) + dt_dev_register_probe(dtps_static[i]); + + return i; +} + +int +dt_prov_static_alloc(struct dt_probe *dtp, struct dt_softc *sc, + struct dt_pcb_list *plist, struct dtioc_req *dtrq) +{ + struct dt_pcb *dp; 
+ + KASSERT(dtioc_req_isvalid(dtrq)); + KASSERT(TAILQ_EMPTY(plist)); + + dp = dt_pcb_alloc(dtp, sc); + if (dp == NULL) + return ENOMEM; + + dp->dp_filter = dtrq->dtrq_filter; + dp->dp_evtflags = dtrq->dtrq_evtflags; + TAILQ_INSERT_HEAD(plist, dp, dp_snext); + + return 0; +} + +void +dt_prov_static_hook(struct dt_provider *dtpv, ...) +{ + struct dt_probe *dtp; + struct dt_pcb *dp; + uintptr_t args[5]; + va_list ap; + int i; + + va_start(ap, dtpv); + dtp = va_arg(ap, struct dt_probe *); + for (i = 0; i < dtp->dtp_nargs; i++) { + args[i] = va_arg(ap, uintptr_t); + } + va_end(ap); + + KASSERT(dtpv == dtp->dtp_prov); + + smr_read_enter(); + SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { + struct dt_evt *dtev; + + dtev = dt_pcb_ring_get(dp); + if (dtev == NULL) + continue; + + dtev->dtev_sysargs[0] = args[0]; + dtev->dtev_sysargs[1] = args[1]; + dtev->dtev_sysargs[2] = args[2]; + dtev->dtev_sysargs[3] = args[3]; + dtev->dtev_sysargs[4] = args[4]; + + dt_pcb_ring_consume(dp, dtev); + } + smr_read_leave(); +} Index: sys/dev/dt/dt_prov_syscall.c =================================================================== RCS file: sys/dev/dt/dt_prov_syscall.c diff -N sys/dev/dt/dt_prov_syscall.c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/dt/dt_prov_syscall.c 16 Jan 2020 18:31:56 -0000 @@ -0,0 +1,206 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2019 Martin Pieuchot <m...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/param.h> +#include <sys/malloc.h> +#include <sys/atomic.h> +#include <sys/syscall.h> + +#include <dev/dt/dtvar.h> + +extern struct sysent sysent[]; + +/* Arrays of probes per syscall. */ +struct dt_probe **dtps_entry; +struct dt_probe **dtps_return; +unsigned int dtps_nsysent = SYS_MAXSYSCALL; + +/* Flags that make sense for this provider */ +#define DTEVT_PROV_SYSCALL (DTEVT_COMMON|DTEVT_FUNCARGS|DTEVT_RETVAL) + +int dt_prov_syscall_alloc(struct dt_probe *, struct dt_softc *, + struct dt_pcb_list *, struct dtioc_req *); +void dt_prov_syscall_entry(struct dt_provider *, ...); +void dt_prov_syscall_return(struct dt_provider *, ...); + +struct dt_provider dt_prov_syscall = { + .dtpv_name = "syscall", + .dtpv_alloc = dt_prov_syscall_alloc, + .dtpv_enter = dt_prov_syscall_entry, + .dtpv_leave = dt_prov_syscall_return, +}; + +int +dt_prov_syscall_init(void) +{ + struct dt_probe *dtp; + int i, len, nprobes = 0; + char *sysnb; + + dtps_entry = mallocarray(dtps_nsysent, sizeof(dtp), M_DT, + M_NOWAIT|M_ZERO); + if (dtps_entry == NULL) + return 0; + dtps_return = mallocarray(dtps_nsysent, sizeof(dtp), M_DT, + M_NOWAIT|M_ZERO); + if (dtps_return == NULL) { + free(dtps_entry, M_DT, dtps_nsysent * sizeof(dtp)); + return 0; + } + + for (i = 0; i < dtps_nsysent; i++) { + if (sysent[i].sy_call == sys_nosys) + continue; + + len = snprintf(NULL, 0, "sys%%%u", i); + sysnb = malloc(len + 1, M_DT, M_NOWAIT); + if (sysnb == NULL) + break; + snprintf(sysnb, len + 1, "sys%%%u", i); + dtp = dt_dev_alloc_probe(sysnb, "entry", &dt_prov_syscall); + if (dtp == NULL) { + free(sysnb, M_DT, len); + 
break; + } + dtp->dtp_sysnum = i; + dtps_entry[i] = dtp; + dt_dev_register_probe(dtp); + nprobes++; + dtp = dt_dev_alloc_probe(sysnb, "return", &dt_prov_syscall); + if (dtp == NULL) + break; + dtp->dtp_sysnum = i; + dtps_return[i] = dtp; + dt_dev_register_probe(dtp); + nprobes++; + } + + return nprobes; +} + +int +dt_prov_syscall_alloc(struct dt_probe *dtp, struct dt_softc *sc, + struct dt_pcb_list *plist, struct dtioc_req *dtrq) +{ + struct dt_pcb *dp; + + KASSERT(dtioc_req_isvalid(dtrq)); + KASSERT(TAILQ_EMPTY(plist)); + KASSERT(dtp->dtp_prov == &dt_prov_syscall); + KASSERT((dtp->dtp_sysnum >= 0) && (dtp->dtp_sysnum < dtps_nsysent)); + + dp = dt_pcb_alloc(dtp, sc); + if (dp == NULL) + return ENOMEM; + + dp->dp_filter = dtrq->dtrq_filter; + dp->dp_evtflags = dtrq->dtrq_evtflags & DTEVT_PROV_SYSCALL; + TAILQ_INSERT_HEAD(plist, dp, dp_snext); + + + return 0; +} + +void +dt_prov_syscall_entry(struct dt_provider *dtpv, ...) +{ + struct dt_probe *dtp; + struct dt_pcb *dp; + register_t sysnum; + size_t argsize; + register_t *args; + va_list ap; + + KASSERT(dtpv == &dt_prov_syscall); + va_start(ap, dtpv); + sysnum = va_arg(ap, register_t); + argsize = va_arg(ap, size_t); + args = va_arg(ap, register_t*); + va_end(ap); + + KASSERT((argsize / sizeof(register_t)) <= DTMAXSYSARGS); + + if (sysnum < 0 || sysnum >= dtps_nsysent) + return; + + dtp = dtps_entry[sysnum]; + if (!dtp->dtp_recording) + return; + + smr_read_enter(); + SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { + struct dt_evt *dtev; + + dtev = dt_pcb_ring_get(dp); + if (dtev == NULL) + continue; + + if (ISSET(dp->dp_evtflags, DTEVT_FUNCARGS)) + memcpy(dtev->dtev_sysargs, args, argsize); + + dt_pcb_ring_consume(dp, dtev); + } + smr_read_leave(); +} + +void +dt_prov_syscall_return(struct dt_provider *dtpv, ...) 
+{ + struct dt_probe *dtp; + struct dt_pcb *dp; + register_t sysnum; + int error; + register_t retval[2]; + va_list ap; + + KASSERT(dtpv == &dt_prov_syscall); + + va_start(ap, dtpv); + sysnum = va_arg(ap, register_t); + error = va_arg(ap, int); + retval[0] = va_arg(ap, register_t); + retval[1] = va_arg(ap, register_t); + va_end(ap); + + if (sysnum < 0 || sysnum >= dtps_nsysent) + return; + + dtp = dtps_return[sysnum]; + if (!dtp->dtp_recording) + return; + + smr_read_enter(); + SMR_SLIST_FOREACH(dp, &dtp->dtp_pcbs, dp_pnext) { + struct dt_evt *dtev; + + dtev = dt_pcb_ring_get(dp); + if (dtev == NULL) + continue; + + if (ISSET(dp->dp_evtflags, DTEVT_RETVAL)) { + dtev->dtev_sysretval[0] = retval[0]; + dtev->dtev_sysretval[1] = retval[1]; + dtev->dtev_syserror = error; + } + + dt_pcb_ring_consume(dp, dtev); + } + smr_read_leave(); +} Index: sys/dev/dt/dtvar.h =================================================================== RCS file: sys/dev/dt/dtvar.h diff -N sys/dev/dt/dtvar.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/dev/dt/dtvar.h 16 Jan 2020 18:31:56 -0000 @@ -0,0 +1,325 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2019 Martin Pieuchot <m...@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
/*
 * Event state: where to store information when a probe fires.
 *
 * The first five fields are filled for every event; the remaining ones
 * only when the corresponding DTEVT_* flag is set on the PCB.
 */
struct dt_evt {
	unsigned int		dtev_pbn;	/* Probe number */
	unsigned int		dtev_cpu;	/* CPU id */
	pid_t			dtev_pid;	/* ID of current process */
	pid_t			dtev_tid;	/* ID of current thread */
	struct timespec		dtev_tsp;	/* timestamp (nsecs) */

	/*
	 * Recorded if the corresponding flag is set.
	 */
	struct dt_stack_trace	dtev_kstack;	/* kernel stack frame */
	char			dtev_comm[DTMAXCOMLEN+1]; /* current process name */
	/* Syscall arguments and return values share the same storage. */
	union {
		register_t		E_entry[DTMAXSYSARGS];
		struct {
			register_t		__retval[2];
			int			__error;
		} E_return;
	} _sys;
#define dtev_sysargs	_sys.E_entry		/* syscall args. */
#define dtev_sysretval	_sys.E_return.__retval	/* syscall retval */
#define dtev_syserror	_sys.E_return.__error	/* syscall error */

};
struct dt_filter {
	enum dt_operand {
		DT_OP_NONE = 0,		/* no comparison is made */
		DT_OP_EQ,		/* variable == dtf_value */
		DT_OP_NE,		/* variable != dtf_value */
	}		dtf_operand;	/* how to compare */
	enum dt_filtervar {
		DT_FV_NONE = 0,		/* nothing to compare */
		DT_FV_PID,		/* PID of the current process */
		DT_FV_TID,		/* TID of the current thread */
	}		dtf_variable	/* what should be filtered */;
	unsigned int	dtf_value;	/* PID or TID to filter */
};
struct dt_pcb {
	SMR_SLIST_ENTRY(dt_pcb)	 dp_pnext;	/* [k,s] next PCB per probe */
	TAILQ_ENTRY(dt_pcb)	 dp_snext;	/* [k] next PCB per softc */

	/*
	 * Event states ring.
	 *
	 * Beware of the names: events are written at `dp_cons' and read
	 * from `dp_prod'.
	 */
	unsigned int		 dp_prod;	/* [m] read index */
	unsigned int		 dp_cons;	/* [m] write index */
	struct dt_evt		*dp_ring;	/* [m] ring of event states */
	struct mutex		 dp_mtx;	/* protects [m] members */

	struct dt_softc		*dp_sc;		/* [I] related softc */
	struct dt_probe		*dp_dtp;	/* [I] related probe */
	uint64_t		 dp_evtflags;	/* [I] event states to record */
	struct dt_filter	 dp_filter;	/* [I] filter to match */

	/* Provider specific fields. */
	unsigned int		 dp_cpuid;	/* [I] on which CPU */
	unsigned int		 dp_maxtick;	/* [I] freq. of profiling */
	unsigned int		 dp_nticks;	/* [c] current tick count */

	/* Counters */
	uint64_t		 dp_dropevt;	/* [m] # of dropped events */
};
struct dt_probe {
	SIMPLEQ_ENTRY(dt_probe)	 dtp_next;	/* [k] global list of probes */
	SMR_SLIST_HEAD(, dt_pcb) dtp_pcbs;	/* [d,s] list of enabled PCBs */
	struct dt_provider	*dtp_prov;	/* [I] its provider */
	const char		*dtp_func;	/* [I] probe function */
	const char		*dtp_name;	/* [I] probe name */
	uint32_t		 dtp_pbn;	/* [I] unique ID */
	volatile uint32_t	 dtp_recording;	/* [d] # of recording PCBs */

	/* Provider specific fields. */
	int			 dtp_sysnum;	/* [I] related # of syscall */
	const char		*dtp_argtype[5];/* [I] type of arguments */
	int			 dtp_nargs;	/* [I] # of arguments */
};
/*
 * Probe definition for the static provider.
 *
 * Defines the `struct dt_probe' named by _DT_STATIC_P(func, name), bound
 * to `dt_prov_static', taking `n' arguments whose type names are given by
 * `arg0' to `arg4' (NULL for the unused ones).  The probe number is left
 * at 0 here and assigned by dt_dev_register_probe().
 *
 * The definition ends on the closing brace: no line continuation follows
 * it, so the macro cannot swallow whatever line comes next.
 */
#define _DT_STATIC_PROBEN(func, name, arg0, arg1, arg2, arg3, arg4, n)	\
	struct dt_probe _DT_STATIC_P(func, name) = {			\
		.dtp_next = { NULL },					\
		.dtp_pcbs = { NULL },					\
		.dtp_prov = &dt_prov_static,				\
		.dtp_func = #func,					\
		.dtp_name = #name,					\
		.dtp_pbn = 0,						\
		.dtp_sysnum = 0,					\
		.dtp_argtype = { arg0, arg1, arg2, arg3, arg4 },	\
		.dtp_nargs = n,						\
	}
do { \ + extern struct dt_probe _DT_STATIC_P(func, name); \ + struct dt_probe *dtp = &_DT_STATIC_P(func, name); \ + struct dt_provider *dtpv = dtp->dtp_prov; \ + \ + if (__predict_false(dt_tracing) && \ + __predict_false(dtp->dtp_recording)) { \ + dtpv->dtpv_enter(dtpv, dtp, args); \ + } \ +} while (0) + +#endif /* !_KERNEL */ +#endif /* !_DT_H_ */ Index: sys/kern/kern_clock.c =================================================================== RCS file: /cvs/src/sys/kern/kern_clock.c,v retrieving revision 1.100 diff -u -p -r1.100 kern_clock.c --- sys/kern/kern_clock.c 2 Nov 2019 16:56:17 -0000 1.100 +++ sys/kern/kern_clock.c 16 Jan 2020 18:31:56 -0000 @@ -55,6 +55,11 @@ #include <sys/gmon.h> #endif +#include "dt.h" +#if NDT > 0 +#include <dev/dt/dtvar.h> +#endif + /* * Clock handling routines. * @@ -167,6 +172,12 @@ hardclock(struct clockframe *frame) if (--ci->ci_schedstate.spc_rrticks <= 0) roundrobin(ci); + +#if NDT > 0 + DT_ENTER(profile, NULL); + if (CPU_IS_PRIMARY(ci)) + DT_ENTER(interval, NULL); +#endif /* * If we are not the primary CPU, we're not allowed to do Index: sys/kern/kern_sched.c =================================================================== RCS file: /cvs/src/sys/kern/kern_sched.c,v retrieving revision 1.62 diff -u -p -r1.62 kern_sched.c --- sys/kern/kern_sched.c 4 Nov 2019 18:06:03 -0000 1.62 +++ sys/kern/kern_sched.c 16 Jan 2020 18:31:56 -0000 @@ -26,6 +26,7 @@ #include <sys/mutex.h> #include <sys/task.h> #include <sys/smr.h> +#include <sys/tracepoint.h> #include <uvm/uvm_extern.h> @@ -261,6 +262,7 @@ setrunqueue(struct cpu_info *ci, struct spc = &p->p_cpu->ci_schedstate; spc->spc_nrun++; + TRACEPOINT(sched, enqueue, p->p_tid, p->p_p->ps_pid); TAILQ_INSERT_TAIL(&spc->spc_qs[queue], p, p_runq); spc->spc_whichqs |= (1 << queue); @@ -282,6 +284,7 @@ remrunqueue(struct proc *p) SCHED_ASSERT_LOCKED(); spc = &p->p_cpu->ci_schedstate; spc->spc_nrun--; + TRACEPOINT(sched, dequeue, p->p_tid, p->p_p->ps_pid); TAILQ_REMOVE(&spc->spc_qs[queue], p, 
p_runq); if (TAILQ_EMPTY(&spc->spc_qs[queue])) { Index: sys/kern/kern_synch.c =================================================================== RCS file: /cvs/src/sys/kern/kern_synch.c,v retrieving revision 1.158 diff -u -p -r1.158 kern_synch.c --- sys/kern/kern_synch.c 16 Jan 2020 16:35:04 -0000 1.158 +++ sys/kern/kern_synch.c 16 Jan 2020 18:31:56 -0000 @@ -51,6 +51,8 @@ #include <sys/refcnt.h> #include <sys/atomic.h> #include <sys/witness.h> +#include <sys/tracepoint.h> + #include <ddb/db_output.h> #include <machine/spinlock.h> @@ -380,6 +382,8 @@ sleep_setup(struct sleep_state *sls, con SCHED_LOCK(sls->sls_s); + TRACEPOINT(sched, sleep, NULL); + p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; @@ -552,6 +556,7 @@ unsleep(struct proc *p) if (p->p_wchan != NULL) { TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_runq); p->p_wchan = NULL; + TRACEPOINT(sched, wakeup, p->p_tid, p->p_p->ps_pid); } } Index: sys/kern/sched_bsd.c =================================================================== RCS file: /cvs/src/sys/kern/sched_bsd.c,v retrieving revision 1.60 diff -u -p -r1.60 sched_bsd.c --- sys/kern/sched_bsd.c 11 Dec 2019 07:30:09 -0000 1.60 +++ sys/kern/sched_bsd.c 16 Jan 2020 18:31:56 -0000 @@ -48,6 +48,7 @@ #include <sys/sched.h> #include <sys/timeout.h> #include <sys/smr.h> +#include <sys/tracepoint.h> #ifdef KTRACE #include <sys/ktrace.h> @@ -392,8 +393,12 @@ mi_switch(void) if (p != nextproc) { uvmexp.swtch++; + TRACEPOINT(sched, off__cpu, nextproc->p_tid, + nextproc->p_p->ps_pid); cpu_switchto(p, nextproc); + TRACEPOINT(sched, on__cpu, NULL); } else { + TRACEPOINT(sched, remain__cpu, NULL); p->p_stat = SONPROC; } Index: sys/sys/conf.h =================================================================== RCS file: /cvs/src/sys/sys/conf.h,v retrieving revision 1.146 diff -u -p -r1.146 conf.h --- sys/sys/conf.h 17 Dec 2019 13:08:54 -0000 1.146 +++ sys/sys/conf.h 16 Jan 2020 18:31:56 -0000 @@ -489,6 +489,13 @@ extern struct cdevsw cdevsw[]; 
(dev_type_stop((*))) enodev, 0, selfalse, \ (dev_init(c,n,mmap)), 0, D_CLONE } +/* open, close, read, ioctl */ +#define cdev_dt_init(c,n) { \ + dev_init(c,n,open), dev_init(c,n,close), dev_init(c,n,read), \ + (dev_type_write((*))) enodev, dev_init(c,n,ioctl), \ + (dev_type_stop((*))) enodev, 0, selfalse, \ + (dev_type_mmap((*))) enodev, 0, D_CLONE } + #endif /* @@ -581,6 +588,8 @@ cdev_decl(rd); bdev_decl(uk); cdev_decl(uk); + +cdev_decl(dt); cdev_decl(diskmap); Index: sys/sys/syscall_mi.h =================================================================== RCS file: /cvs/src/sys/sys/syscall_mi.h,v retrieving revision 1.24 diff -u -p -r1.24 syscall_mi.h --- sys/sys/syscall_mi.h 29 Nov 2019 06:34:46 -0000 1.24 +++ sys/sys/syscall_mi.h 16 Jan 2020 18:31:56 -0000 @@ -33,12 +33,18 @@ #include <sys/param.h> #include <sys/pledge.h> +#include <sys/tracepoint.h> #include <uvm/uvm_extern.h> #ifdef KTRACE #include <sys/ktrace.h> #endif +#include "dt.h" +#if NDT > 0 +#include <dev/dt/dtvar.h> +#endif + /* * The MD setup for a system call has been done; here's the MI part. @@ -59,6 +65,10 @@ mi_syscall(struct proc *p, register_t co scdebug_call(p, code, argp); KERNEL_UNLOCK(); #endif + TRACEPOINT(raw_syscalls, sys_enter, code, NULL); +#if NDT > 0 + DT_ENTER(syscall, code, callp->sy_argsize, argp); +#endif #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) { KERNEL_LOCK(); @@ -108,6 +118,10 @@ mi_syscall_return(struct proc *p, regist scdebug_ret(p, code, error, retval); KERNEL_UNLOCK(); #endif +#if NDT > 0 + DT_LEAVE(syscall, code, error, retval[0], retval[1]); +#endif + TRACEPOINT(raw_syscalls, sys_exit, code, NULL); userret(p); @@ -126,17 +140,23 @@ mi_syscall_return(struct proc *p, regist static inline void mi_child_return(struct proc *p) { -#if defined(SYSCALL_DEBUG) || defined(KTRACE) +#if defined(SYSCALL_DEBUG) || defined(KTRACE) || NDT > 0 int code = (p->p_flag & P_THREAD) ? SYS___tfork : (p->p_p->ps_flags & PS_PPWAIT) ? 
SYS_vfork : SYS_fork; const register_t child_retval[2] = { 0, 1 }; #endif + TRACEPOINT(sched, on__cpu, NULL); + #ifdef SYSCALL_DEBUG KERNEL_LOCK(); scdebug_ret(p, code, 0, child_retval); KERNEL_UNLOCK(); #endif +#if NDT > 0 + DT_LEAVE(syscall, code, 0, child_retval[0], child_retval[1]); +#endif + TRACEPOINT(raw_syscalls, sys_exit, code, NULL); userret(p); Index: sys/sys/tracepoint.h =================================================================== RCS file: sys/sys/tracepoint.h diff -N sys/sys/tracepoint.h --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ sys/sys/tracepoint.h 16 Jan 2020 18:34:15 -0000 @@ -0,0 +1,36 @@ +/* $OpenBSD$ */ + +/* + * Copyright (c) 2019 Martin Pieuchot <mpi@openbsd.org> + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef _SYS_TRACEPOINT_H_ +#define _SYS_TRACEPOINT_H_ + +#ifdef _KERNEL + +#include "dt.h" +#if NDT > 0 +#include <dev/dt/dtvar.h> + +#define TRACEPOINT(func, name, args...) DT_STATIC_ENTER(func, name, args) + +#else /* NDT > 0 */ + +#define TRACEPOINT(func, name, args...) + +#endif /* NDT > 0 */ +#endif /* _KERNEL */ +#endif /* _SYS_TRACEPOINT_H_ */