Le sam. 9 mars 2024 à 15:03, Luca Dariz <l...@orpolo.org> a écrit :
> This extends the previous debug utility to trace system calls with > more events and the ability to control the tracing from userspace, > collecting a trace of the whole systems. > > This tool was quite useful in porting the rpc format to 64 bit and > handle the 32to64 translation, but also to debug user-space programs. > > It's still a draft with some limitations and open points: > - DEBUG_TRACE for 64 bit is missing > - check if we can have meaningful and reliable data removing the > syscall number on syscall return; this would avoid saving it in the > thread structure. > - add more info to each trace entry, e.g. current CPU or timestamps > - trace more events > - add more filtering capabilities > - trace a single subhurd > - trace format is not exported > --- > Makefrag.am | 7 + > configfrag.ac | 12 ++ > i386/i386/debug.h | 16 +- > i386/i386/debug_i386.c | 45 ++-- > i386/i386/locore.S | 33 ++- > i386/i386/pcb.c | 4 + > i386/i386/trap.c | 5 + > include/mach/gnumach.defs | 33 +++ > include/mach/trace.h | 61 ++++++ > ipc/ipc_kmsg.c | 6 +- > ipc/mach_msg.c | 2 + > kern/exception.c | 1 + > kern/startup.c | 2 + > kern/thread.h | 4 + > kern/trace.c | 432 ++++++++++++++++++++++++++++++++++++++ > kern/trace.h | 42 ++++ > scripts/gnumach-gdb.py | 63 ++++++ > tests/test-trace.c | 58 +++++ > tests/user-qemu.mk | 3 +- > x86_64/debug_trace.S | 9 +- > x86_64/interrupt.S | 19 ++ > x86_64/locore.S | 84 +++++++- > 22 files changed, 887 insertions(+), 54 deletions(-) > create mode 100644 include/mach/trace.h > create mode 100644 kern/trace.c > create mode 100644 kern/trace.h > create mode 100644 scripts/gnumach-gdb.py > create mode 100644 tests/test-trace.c > > diff --git a/Makefrag.am b/Makefrag.am > index 82fce628..25d73ec3 100644 > --- a/Makefrag.am > +++ b/Makefrag.am > @@ -221,6 +221,12 @@ libkernel_a_SOURCES += \ > kern/xpr.h \ > kern/elf-load.c \ > kern/boot_script.c > + > +if KERNEL_TRACE > +libkernel_a_SOURCES += \ > + kern/trace.c > +endif > + > EXTRA_DIST += \ > kern/exc.defs \ > kern/mach.srv \ > @@ -407,6 +413,7 @@ include_mach_HEADERS = \ > include/mach/profilparam.h \ > include/mach/std_types.h \ > include/mach/syscall_sw.h \ > + include/mach/trace.h \ > include/mach/task_info.h \ > include/mach/task_special_ports.h \ > include/mach/thread_info.h \ > diff --git a/configfrag.ac b/configfrag.ac > index b8b41261..f4dbe63d 100644 > --- a/configfrag.ac > +++ b/configfrag.ac > @@ -139,6 +139,18 @@ AC_DEFINE([SLAB_VERIFY], [0], [SLAB_VERIFY]) > > # Enable the CPU pool layer in the slab allocator. > AC_DEFINE([SLAB_USE_CPU_POOLS], [0], [SLAB_USE_CPU_POOLS]) > + > +# enable support for tracing various kernel events > +AC_ARG_ENABLE([kernel-trace], > + AS_HELP_STRING([--enable-kernel-trace], [Enable kernel event tracing])) > +[if [ x"$enable_kernel_trace" = xyes ]; then] > + AC_DEFINE([KERNEL_TRACE], [1], [Enable kernel event tracing]) > + AM_CONDITIONAL([KERNEL_TRACE], [true]) > +[else] > + AC_DEFINE([KERNEL_TRACE], [0], [Enable kernel event tracing]) > + AM_CONDITIONAL([KERNEL_TRACE], [false]) > +[fi] > + > > # > # Options. > diff --git a/i386/i386/debug.h b/i386/i386/debug.h > index 84397ba8..eff330c6 100644 > --- a/i386/i386/debug.h > +++ b/i386/i386/debug.h > @@ -54,6 +54,7 @@ void debug_trace_dump(void); > > #else /* __ASSEMBLER__ */ > > +#ifndef __x86_64__ > #define DEBUG_TRACE \ > pushl $__LINE__ ;\ > pushl $9f ;\ > @@ -62,10 +63,21 @@ void debug_trace_dump(void); > .data ;\ > 9: .ascii __FILE__"\0" ;\ > .text > - > +#else /* __x86_64__ */ > +#define DEBUG_TRACE \ > + pushq %rdi ;\ > + pushq %rs1 ;\ > Is it really %rs1 here and not %rsi ? you are pushing rs1 but popping rsi > + movq $__LINE__,%rdi ;\ > + movq $9f,%rsi ;\ > + call __debug_trace ;\ > + popq %rsi ;\ > + popq %rdi ;\ > + .data ;\ > +9: .ascii __FILE__"\0" ;\ > + .text > +#endif > #endif /* __ASSEMBLER__ */ > > - > #endif /* DEBUG */ > > /* XXX #include_next "debug.h" */ > diff --git a/i386/i386/debug_i386.c b/i386/i386/debug_i386.c > index 41d032e3..61d73dff 100644 > --- a/i386/i386/debug_i386.c > +++ b/i386/i386/debug_i386.c > @@ -22,6 +22,7 @@ > */ > > #include <kern/printf.h> > +#include <kern/trace.h> > > #include "thread.h" > #include "trap.h" > @@ -140,39 +141,23 @@ debug_trace_dump(void) > > splx(s); > } > +#endif /* DEBUG */ > > -#include <kern/syscall_sw.h> > - > -int syscall_trace = 0; > -task_t syscall_trace_task; > +#if KERNEL_TRACE > > -int > -syscall_trace_print(int syscallvec, ...) > +#ifndef __x86_64__ > +#define SYSCALL_NUM_SHIFT 4 > +#else > +#define SYSCALL_NUM_SHIFT 5 > +#endif > +long > +syscall_trace_enter(long *sysv) > { > - int syscallnum = syscallvec >> 4; > - int i; > - const mach_trap_t *trap = &mach_trap_table[syscallnum]; > - > - if (syscall_trace_task && syscall_trace_task != current_task()) > - goto out; > - > - printf("0x%08x:0x%08x:%s(", > - current_task(), current_thread(), trap->mach_trap_name); > - for (i = 0; i < trap->mach_trap_arg_count; i++) { > - unsigned long value = (&syscallvec)[1+i]; > - /* Use a crude heuristic to format pointers. */ > - if (value > 1024) > - printf("0x%08x", value); > - else > - printf("%d", value); > - > - if (i + 1 < trap->mach_trap_arg_count) > - printf(", "); > - } > - printf(")\n"); > + long sysnum = sysv[0]; > > - out: > - return syscallvec; > + syscall_trace_common(sysnum >> SYSCALL_NUM_SHIFT, sysv + 1); > + > + return sysnum; > } > > -#endif /* DEBUG */ > +#endif /* KERNEL_TRACE */ > diff --git a/i386/i386/locore.S b/i386/i386/locore.S > index 9d0513a1..26c5843c 100644 > --- a/i386/i386/locore.S > +++ b/i386/i386/locore.S > @@ -634,6 +634,15 @@ ENTRY(thread_bootstrap_return) > */ > > ENTRY(thread_syscall_return) > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 1f > + movl S_ARG0,%eax /* get return value */ > + pushl %eax > + call syscall_trace_return > + movl %eax,S_ARG0 /* restore return value */ > +1: > +#endif > movl S_ARG0,%eax /* get return value */ > movl %esp,%ecx /* get kernel stack */ > or $(KERNEL_STACK_SIZE-1),%ecx > @@ -1174,18 +1183,26 @@ syscall_native: > > mach_call_call: > > -#ifdef DEBUG > +#if KERNEL_TRACE > testb $0xff,EXT(syscall_trace) > jz 0f > - pushl %eax > - call EXT(syscall_trace_print) > - /* will return with syscallofs still (or again) in eax */ > - addl $4,%esp > + pushl %eax /* add syscall num to args array > */ > + pushl %esp /* args array is the first > argument*/ > + call EXT(syscall_trace_enter) > + popl %eax > + popl %eax > you are popping eax twice here, shouldn’t the second one be esp? > 0: > -#endif /* DEBUG */ > +#endif /* KERNEL_TRACE */ > + call *EXT(mach_trap_table)+4(%eax) /* call procedure */ > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 1f > + pushl %eax > + call syscall_trace_return > + /* will return with return value still (or again) in eax */ > +1: > +#endif /* KERNEL_TRACE */ > > - call *EXT(mach_trap_table)+4(%eax) > - /* call procedure */ > movl %esp,%ecx /* get kernel stack */ > or $(KERNEL_STACK_SIZE-1),%ecx > movl -3-IKS_SIZE(%ecx),%esp /* switch back to PCB stack */ > diff --git a/i386/i386/pcb.c b/i386/i386/pcb.c > index e8901550..a3ed7d67 100644 > --- a/i386/i386/pcb.c > +++ b/i386/i386/pcb.c > @@ -37,6 +37,7 @@ > #include <kern/counters.h> > #include <kern/debug.h> > #include <kern/thread.h> > +#include <kern/trace.h> > #include <kern/sched_prim.h> > #include <kern/slab.h> > #include <vm/vm_kern.h> > @@ -308,6 +309,7 @@ void stack_handoff( > /* > * Load the rest of the user state for the new thread > */ > + context_switch_trace(old, NULL, new); > switch_ktss(new->pcb); > > /* > @@ -332,6 +334,7 @@ void stack_handoff( > */ > void load_context(thread_t new) > { > + context_switch_trace(NULL, NULL, new); > switch_ktss(new->pcb); > Load_context(new); > } > @@ -382,6 +385,7 @@ thread_t switch_context( > /* > * Load the rest of the user state for the new thread > */ > + context_switch_trace(old, continuation, new); > switch_ktss(new->pcb); > return Switch_context(old, continuation, new); > } > diff --git a/i386/i386/trap.c b/i386/i386/trap.c > index 555d7498..17f7eb3c 100644 > --- a/i386/i386/trap.c > +++ b/i386/i386/trap.c > @@ -53,6 +53,7 @@ > #include <kern/printf.h> > #include <kern/thread.h> > #include <kern/task.h> > +#include <kern/trace.h> > #include <kern/sched.h> > #include <kern/sched_prim.h> > #include <kern/exception.h> > @@ -162,6 +163,8 @@ void kernel_trap(struct i386_saved_state *regs) > thread_t thread; > extern char _start[], etext[]; > > + trap_trace(TRACE_TRAP_KERNEL, regs); > + > type = regs->trapno; > code = regs->err; > thread = current_thread(); > @@ -366,6 +369,7 @@ int user_trap(struct i386_saved_state *regs) > unsigned long type; > thread_t thread = current_thread(); > > + trap_trace(TRACE_TRAP_USER, regs); > #ifdef __x86_64__ > assert(regs == &thread->pcb->iss); > #endif > @@ -592,6 +596,7 @@ int user_trap(struct i386_saved_state *regs) > void > i386_astintr(void) > { > + trap_trace(TRACE_TRAP_ASTINTR, &(current_thread()->pcb->iss)); > (void) splsched(); /* block interrupts to check reasons */ > #ifndef MACH_RING1 > int mycpu = cpu_number(); > diff --git a/include/mach/gnumach.defs b/include/mach/gnumach.defs > index 7ecf74d3..583bdc36 100644 > --- a/include/mach/gnumach.defs > +++ b/include/mach/gnumach.defs > @@ -215,3 +215,36 @@ routine vm_pages_phys( > simpleroutine thread_set_name( > thread : thread_t; > name : kernel_debug_name_t); > + > +/* > + * Start tracing kernel events, using a buffer of BUFSIZE bytes and > + * tracing only events set in KINDS. > + */ > +simpleroutine ktrace_start( > + host_priv : host_priv_t; > + bufsize : rpc_vm_size_t; > + kinds : rpc_long_integer_t); > + > +/* > + * Stop tracing kernel events. > + */ > +simpleroutine ktrace_stop( > + host_priv : host_priv_t); > + > +/* > + * Free the tracing buffer allocated in ktrace_start(). > + */ > +simpleroutine ktrace_free( > + host_priv : host_priv_t); > + > +/* > + * Read the content of the tracing buffer in BUF; additionally we get > + * a pointer to the first free byte after the tracing data in LAST_PTR > + * and some trace parameters in FLAGS. > + */ > +routine ktrace_collect( > + host_priv : host_priv_t; > + out buf : pointer_t; > + out last_ptr : vm_offset_t; > + out flags : natural_t); > + > diff --git a/include/mach/trace.h b/include/mach/trace.h > new file mode 100644 > index 00000000..3fccbbc7 > --- /dev/null > +++ b/include/mach/trace.h > @@ -0,0 +1,61 @@ > +/* > + * Copyright (C) 2023 Free Software Foundation > + * > + * This program is free software ; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation ; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY ; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with the program ; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#ifndef _MACH_TRACE_H_ > +#define _MACH_TRACE_H_ > + > +#include <stdint.h> > +#include <mach/std_types.h> > + > +/* trace types as encoded in the ring buffer */ > +#define DEBUG_TRACE_INVALID 0 > +#define DEBUG_TRACE_SYSCALL 1 > +#define DEBUG_TRACE_SAMPLE 2 > +#define DEBUG_TRACE_TRAP 3 > +#define DEBUG_TRACE_CSW 4 > +#define DEBUG_TRACE_IRQ 5 > +#define DEBUG_TRACE_RPC 6 > +#define DEBUG_TRACE_DUMMY 15 > + > +/* trap events */ > +#define TRACE_TRAP_USER 0 > +#define TRACE_TRAP_KERNEL 1 > +#define TRACE_TRAP_ASTINTR 2 > + > +/* trace types bitmask, to be used in trace_start()*/ > +#define TRACE_TYPE_SYSCALL 0x01 > +#define TRACE_TYPE_TRAP 0x02 > +#define TRACE_TYPE_CSW 0x04 > +#define TRACE_TYPE_IRQ 0x08 > +#define TRACE_TYPE_RPC 0x10 > +#define TRACE_TYPE_ALL 0x1F > + > +/* Trace file header definitions */ > +#define TH_MAGIC "GMTR" > +#define TH_OVERRUN_MASK 0x0f > +#define TH_KERNEL64 0x10 > +#define TH_USER64 0x20 > + > +typedef struct { > + char magic[4]; > + uint32_t flags; > + uint32_t last; > + uint32_t ov_first; > +} trace_header_t; > + > +#endif /* _MACH_TRACE_H_ */ > diff --git a/ipc/ipc_kmsg.c b/ipc/ipc_kmsg.c > index 8bd645ff..8a153386 100644 > --- a/ipc/ipc_kmsg.c > +++ b/ipc/ipc_kmsg.c > @@ -509,7 +509,7 @@ ipc_kmsg_get( > ikm_free(kmsg); > return MACH_SEND_INVALID_DATA; > } > - > + rpc_trace(0, msg, size); > *kmsgp = kmsg; > return MACH_MSG_SUCCESS; > } > @@ -576,8 +576,10 @@ ipc_kmsg_put( > > if (copyoutmsg(&kmsg->ikm_header, msg, size)) > mr = MACH_RCV_INVALID_DATA; > - else > + else { > mr = MACH_MSG_SUCCESS; > + rpc_trace(1, msg, msg->msgh_size); > + } > > ikm_cache_free(kmsg); > > diff --git a/ipc/mach_msg.c b/ipc/mach_msg.c > index ff5e5b09..3a4298ff 100644 > --- a/ipc/mach_msg.c > +++ b/ipc/mach_msg.c > @@ -464,6 +464,7 @@ mach_msg_trap( > ikm_free(kmsg); > goto slow_get; > } > + rpc_trace(0, msg, send_size); > > fast_copyin: > /* > @@ -1194,6 +1195,7 @@ mach_msg_trap( > if (!ikm_cache_free_try(kmsg)) > goto slow_put; > > + rpc_trace(1, msg, msg->msgh_size); > thread_syscall_return(MACH_MSG_SUCCESS); > /*NOTREACHED*/ > return MACH_MSG_SUCCESS; /* help for the compiler */ > diff --git a/kern/exception.c b/kern/exception.c > index 15f29705..0e0260f0 100644 > --- a/kern/exception.c > +++ b/kern/exception.c > @@ -702,6 +702,7 @@ exception_raise( > /*NOTREACHED*/ > } > > + rpc_trace(1, receiver->ith_msg, receiver->ith_msg->msgh_size); > thread_syscall_return(MACH_MSG_SUCCESS); > /*NOTREACHED*/ > #ifndef __GNUC__ > diff --git a/kern/startup.c b/kern/startup.c > index 61751d35..ce8f0123 100644 > --- a/kern/startup.c > +++ b/kern/startup.c > @@ -47,6 +47,7 @@ > #include <kern/thread.h> > #include <kern/thread_swap.h> > #include <kern/timer.h> > +#include <kern/trace.h> > #include <kern/xpr.h> > #include <kern/bootstrap.h> > #include <kern/startup.h> > @@ -163,6 +164,7 @@ void setup_main(void) > compute_mach_factor(); > > gsync_setup (); > + trace_init(); > > /* > * Create a kernel thread to start the other kernel > diff --git a/kern/thread.h b/kern/thread.h > index 81d32924..5b5d2235 100644 > --- a/kern/thread.h > +++ b/kern/thread.h > @@ -240,6 +240,10 @@ struct thread { > #endif > > char name[THREAD_NAME_SIZE]; > + > +#if KERNEL_TRACE > + unsigned int syscall_num; /* for syscall tracing */ > +#endif > }; > > #include <kern/cpu_number.h> > diff --git a/kern/trace.c b/kern/trace.c > new file mode 100644 > index 00000000..69e6eaf0 > --- /dev/null > +++ b/kern/trace.c > @@ -0,0 +1,432 @@ > +/* > + * Copyright (C) 2024 Free Software Foundation > + * > + * This program is free software ; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation ; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY ; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with the program ; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#include "kern/thread.h" > +#include <kern/gnumach.server.h> > +#include <kern/kalloc.h> > +#include <kern/printf.h> > +#include <kern/syscall_sw.h> > +#include <kern/trace.h> > +#include <mach/message.h> > +#include <mach/trace.h> > +#include <machine/locore.h> > +#include <string.h> > +#include "vm/vm_kern.h" > + > +/* Common information for each trace entry */ > +struct msg_trace_header { > + char task_name[TASK_NAME_SIZE]; > + thread_t thread; > + //int time; // to address thread/task reuse? we should trace also task > creation/destruction... > + unsigned int type : 8; > + unsigned int size : 24; // in bytes, can probably be reduced > + uint8_t data[]; > +} __attribute__((packed)); > + > +struct syscall_trace_data { > + unsigned int num : 7; // syscall id > + unsigned int direction : 1; // entry/exit > + unsigned int reserved : 24; > + uint8_t data[]; // args/retcode, depending on the syscall number and > direction > +} __attribute__((packed)); > + > +struct rpc_trace_data { > + unsigned int size : 16; // in bytes > + unsigned int direction : 1; // send-copyin/recv-copyout > + unsigned int reserved : 15; > + uint8_t data[]; > +} __attribute__((packed)); > + > +struct trap_trace_data { > + unsigned int origin : 4; // user/kernel > + unsigned int reserved : 28; > + struct i386_saved_state regs; > +} __attribute__((packed)); > + > +struct csw_trace_data { > + char old_task_name[TASK_NAME_SIZE]; > + thread_t old_thread; > + continuation_t continuation; > + char new_task_name[TASK_NAME_SIZE]; > + thread_t new_thread; > +} __attribute__((packed)); > + > +struct irq_trace_data { > + int irqn; > +} __attribute__((packed)); > + > +struct sample_trace_data { > + unsigned int linenum; > + char filename[]; > +} __attribute__((packed)); > + > +static uint8_t *msg_trace_buf = NULL; > +static vm_size_t msg_trace_size = 0; > +static uint8_t *msg_trace_ptr = NULL; > +static int msg_trace_overrun = 0; > + > +int syscall_trace; > +static int trace_rpc_enable; > +static int trace_trap_enable; > +static int trace_csw_enable; > +int interrupt_trace_enable; > + > +task_t syscall_trace_task = 0; > +unsigned int syscall_seqn = 0; > + > +/* Called to initialize kernel tracing */ > +void trace_init(void) > +{ > + msg_trace_buf = NULL; > + msg_trace_ptr = NULL; > + msg_trace_size = 0; > + msg_trace_overrun = 0; > + > + syscall_trace = 0; > + trace_rpc_enable = 0; > + trace_trap_enable = 0; > + trace_csw_enable = 0; > + interrupt_trace_enable = 0; > + > + /* Useful for debugging small tests and the bootstrap modules */ > + extern char* kernel_cmdline; // TODO: find a nice header > + if (strstr(kernel_cmdline, "tracestart") != NULL) > + { > + kern_return_t ret = ktrace_start(&realhost, 20*1024*1024, > TRACE_TYPE_ALL); > + if (ret == KERN_SUCCESS) > + printf("trace: started from command line\n"); > + else > + panic("Can't initialize trace buffer: error %d\n", ret); > + } > +} > + > +kern_return_t ktrace_start(const host_t host_priv, rpc_vm_size_t bufsize, > + rpc_long_integer_t kinds) > +{ > + if (host_priv == HOST_NULL) > + return KERN_INVALID_ARGUMENT; > + > + if (msg_trace_buf != NULL) > + return KERN_FAILURE; > + > + int kr = kmem_alloc(kernel_map, (vm_offset_t*) &msg_trace_buf, bufsize); > + if (kr != KERN_SUCCESS) > + return KERN_RESOURCE_SHORTAGE; > + > + if (msg_trace_buf == 0) > + return KERN_RESOURCE_SHORTAGE; > + > + msg_trace_ptr = msg_trace_buf; > + msg_trace_size = bufsize; > + msg_trace_overrun = 0; > + > + if (kinds & TRACE_TYPE_SYSCALL) > + syscall_trace = 1; > + if (kinds & TRACE_TYPE_RPC) > + trace_rpc_enable = 1; > + if (kinds & TRACE_TYPE_TRAP) > + trace_trap_enable = 1; > + if (kinds & TRACE_TYPE_CSW) > + trace_csw_enable = 1; > + if (kinds & TRACE_TYPE_IRQ) > + interrupt_trace_enable = 1; > + > + if (current_thread()) > + current_thread()->syscall_num = 25; // fake mach_msg() entry for > better trace > + > + return KERN_SUCCESS; > +} > + > +kern_return_t ktrace_stop(const host_t host_priv) > +{ > + if (host_priv == HOST_NULL) > + return KERN_INVALID_ARGUMENT; > + > + if (msg_trace_buf == NULL) > + return KERN_FAILURE; > + > + syscall_trace = 0; > + trace_rpc_enable = 0; > + trace_trap_enable = 0; > + trace_csw_enable = 0; > + interrupt_trace_enable = 0; > + return KERN_SUCCESS; > +} > + > +kern_return_t ktrace_free(const host_t host_priv) > +{ > + if (host_priv == HOST_NULL) > + return KERN_INVALID_ARGUMENT; > + > + if (msg_trace_buf == NULL) > + return KERN_FAILURE; > + > + kmem_free(kernel_map, (vm_offset_t) msg_trace_buf, msg_trace_size); > + msg_trace_buf = NULL; > + msg_trace_ptr = NULL; > + return KERN_SUCCESS; > +} > + > +kern_return_t ktrace_collect(const host_t host_priv, pointer_t *buf, > + mach_msg_type_number_t *bufsize, > + vm_size_t *last_ptr, natural_t *flags) > +{ > + if (host_priv == HOST_NULL) > + return KERN_INVALID_ARGUMENT; > + > + if (msg_trace_buf == NULL) > + return KERN_FAILURE; > + > + kern_return_t err; > + vm_map_copy_t copy; > + err = vm_map_copyin(kernel_map, (vm_offset_t) msg_trace_buf, > msg_trace_size, > + FALSE, ©); > + if (err == KERN_SUCCESS) > + { > + *buf = (pointer_t) copy; > + *bufsize = msg_trace_size; > + *last_ptr = msg_trace_ptr - msg_trace_buf; > + *flags = 0; > + if (msg_trace_overrun > 0) > + { > + if (msg_trace_overrun < TH_OVERRUN_MASK) > + *flags |= msg_trace_overrun & TH_OVERRUN_MASK; > + else > + *flags |= TH_OVERRUN_MASK; > + } > +#ifdef __LP64__ > + *flags |= TH_KERNEL64; > +#ifndef USER32 > + *flags |= TH_USER64; > +#endif /* USER32 */ > +#endif /* __x86_64__ */ > + } > + return err; > +} > + > +/* > + * Starts a new trace entry. It pre-allocates a given size and returns a > pointer > + * to a region that can be filled with the same amount of data. > + * In this way we can copy the data only once, e.g. when tracing > mach_msg() > + */ > +static void* trace_msg(unsigned int type, size_t size) > +{ > + const size_t msg_size = size + sizeof(struct msg_trace_header); > + int overrun = 0; > + uint8_t *tptr, *tptr_orig; > + struct msg_trace_header *cur; > + > + /* > + * Handle interrupitons from trap/irq in reserving the trace data > + * entry. > + * > + * TODO: ensure this is SMP-safe and maybe add cpuid to the trace > + * header. > + */ > + do { > + tptr = __atomic_load_n(&msg_trace_ptr, __ATOMIC_RELAXED); > + tptr_orig = tptr; > + > + if ((tptr - msg_trace_buf) > (msg_trace_size - msg_size)) > + { > + // circular buffer, reset to the beginning > + tptr = msg_trace_buf; > + overrun = 1; > + } > + > + tptr += msg_size; > + > + } while (! __atomic_compare_exchange_n(&msg_trace_ptr, &tptr_orig, > tptr, 0, > + __ATOMIC_RELAXED, > __ATOMIC_RELAXED)); > + > + if (overrun) > + msg_trace_overrun += 1; > + > + cur = (struct msg_trace_header*)(tptr - msg_size); > + memcpy(cur->task_name, current_task()->name, TASK_NAME_SIZE); > + cur->thread = current_thread(); > + cur->type = type; > + cur->size = msg_size; > + return cur->data; > +} > + > +#ifdef SYSCALL_TRACE_RECORD > +static void trace_syscall_entry(int num, long *args) > +{ > + const mach_trap_t *trap = &mach_trap_table[num]; > + size_t argsize = sizeof(long)*trap->mach_trap_arg_count; > + size_t size = sizeof(struct syscall_trace_data) + argsize; > + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size); > + data->direction = 0; > + data->num = num; > + memcpy(data->data, args, argsize); > +} > + > +static void trace_syscall_exit(int num, long ret) > +{ > + size_t argsize = sizeof(long); > + size_t size = sizeof(struct syscall_trace_data) + argsize; > + struct syscall_trace_data *data = trace_msg(DEBUG_TRACE_SYSCALL, size); > + data->direction = 1; > + data->num = num; > + memcpy(data->data, &ret, argsize); > +} > +#endif /* SYSCALL_TRACE_RECORD */ > + > +/* This needs to be called from the machine-specific entry point */ > +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS]) > +{ > + > + if (syscall_trace == 0) > + goto out; > + > + current_thread()->syscall_num = sysnum; > + if (syscall_trace_task && syscall_trace_task != current_task()) > + goto out; > + > +#ifdef SYSCALL_TRACE_PRINT > + if (sysnum == 30) // don't track mach_print() > + goto out; > + > + syscall_seqn++; > + const mach_trap_t *trap = &mach_trap_table[sysnum]; > + printf("%08d:%s:0x%p:%s(", > + syscall_seqn, current_task()->name, current_thread(), > trap->mach_trap_name); > + for (int i=0; i<MAX_SYSCALL_ARGS; i++) > + { > + int value = argv[i]; > + if (trap->mach_trap_arg_count <= i) > + continue; > + if ((value > 1024) || (value < -1024)) > + printf("0x%08x", value); > + else > + printf("%d", value); > + if (i + 1 < trap->mach_trap_arg_count) > + printf(", "); > + } > + printf(")\n"); > +#else /* SYSCALL_TRACE_PRINT */ > + trace_syscall_entry(sysnum, argv); > +#endif > + out: > + return; > +} > + > +/* This needs to be called from the machine-specific entry point */ > +int syscall_trace_return(int val) > +{ > + if (syscall_trace == 0) > + return val; > + if (syscall_trace_task && syscall_trace_task != current_task()) > + return val; > +#ifdef SYSCALL_TRACE_PRINT > + if (current_thread()->syscall_num != 30) // don't track mach_print() > + { > + const mach_trap_t *trap = > &mach_trap_table[current_thread()->syscall_num]; > + printf("%08d:%s:0x%p: %s return(%08x)\n", > + syscall_seqn, current_task()->name, current_thread(), > + trap->mach_trap_name, val); > + } > +#else /* SYSCALL_TRACE_PRINT */ > + trace_syscall_exit(current_thread()->syscall_num, val); > +#endif > + > + return val; > +} > + > +void rpc_trace(int direction, void *msg, long msize) > +{ > + if (!trace_rpc_enable) > + return; > + > + size_t size = sizeof(struct rpc_trace_data); > + struct rpc_trace_data *data = trace_msg(DEBUG_TRACE_RPC, size + msize); > + data->direction = direction; > + data->size = msize; > + if (copyin(msg, data->data, msize)) > + memset(data->data, 0, msize); > +} > + > +/* This needs to be called from the machine-specific entry point */ > +void trap_trace(int origin, struct i386_saved_state *regs) > +{ > + if (!trace_trap_enable) > + return; > + > + size_t size = sizeof(struct trap_trace_data); > + struct trap_trace_data *data = trace_msg(DEBUG_TRACE_TRAP, size); > + data->origin = origin; > + memcpy(&data->regs, regs, sizeof(*regs)); > +} > + > +void context_switch_trace(thread_t old, continuation_t continuation, > thread_t new) > +{ > + if (!trace_csw_enable) > + return; > + > + size_t size = sizeof(struct csw_trace_data); > + struct csw_trace_data *data = trace_msg(DEBUG_TRACE_CSW, size); > + if (old) > + strncpy(data->old_task_name, old->task->name, TASK_NAME_SIZE); > + else > + data->old_task_name[0] = '\0'; > + data->old_thread = old; > + data->continuation = continuation; > + strncpy(data->new_task_name, new->task->name, TASK_NAME_SIZE); > + data->new_thread = new; > +} > + > +void interrupt_trace(int iunit, /* 'unit' number */ > + int old_ipl, /* old interrupt level */ > + const char *ret_addr, /* return address in interrupt > handler */ > + struct i386_interrupt_state *regs, /* saved > registers */ > + int irqn) > +{ > + size_t size = sizeof(struct irq_trace_data); > + struct irq_trace_data *data = trace_msg(DEBUG_TRACE_IRQ, size); > + data->irqn = irqn; > +} > + > +/* This trace type is useful during debug to create on-the-fly custom > + traces, e.g. to follow the evolution of a specific value or > + expression along other trace points */ > +void dummy_trace(void) > +{ > + struct dummy_data { > + int nthreads; > + }; > + size_t size = sizeof(struct dummy_data); > + struct dummy_data *data = trace_msg(DEBUG_TRACE_DUMMY, size); > + > + // as an example, we count the threads > + task_t task; > + thread_t thread; > + processor_set_t pset; > + data->nthreads = 0; > + if (queue_first(&all_psets) == 0) > + return; > + queue_iterate(&all_psets, pset, processor_set_t, all_psets) { > + if (queue_first(&pset->tasks) == 0) > + continue; > + queue_iterate(&pset->tasks, task, task_t, pset_tasks) { > + queue_iterate(&task->thread_list, thread, thread_t, thread_list) { > + data->nthreads++; > + } > + } > + } > +} > diff --git a/kern/trace.h b/kern/trace.h > new file mode 100644 > index 00000000..06d77e93 > --- /dev/null > +++ b/kern/trace.h > @@ -0,0 +1,42 @@ > +/* > + * Copyright (C) 2023 Free Software Foundation > + * > + * This program is free software ; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation ; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY ; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with the program ; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#if KERNEL_TRACE > + > +#include <mach/trace.h> > +#include <kern/thread.h> > + > +/* syscall events */ > +#define MAX_SYSCALL_ARGS 12 > +//#define SYSCALL_TRACE_PRINT // useful for small tests > +#define SYSCALL_TRACE_RECORD > + > +void trace_init(void); > +void rpc_trace(int direction, void *msg, long size); > +void trap_trace(int origin, struct i386_saved_state *regs); > +void syscall_trace_common(long sysnum, long argv[MAX_SYSCALL_ARGS]); > +int syscall_trace_return(int val); > +void context_switch_trace(thread_t old, continuation_t continuation, > thread_t new); > + > +#else /* ! KERNEL_TRACE */ > + > +#define trace_init() > +#define trap_trace(origin, regs) > +#define context_switch_trace(old, cont, new) > + > +#endif /* KERNEL_TRACE */ > diff --git a/scripts/gnumach-gdb.py b/scripts/gnumach-gdb.py > new file mode 100644 > index 00000000..16728c06 > --- /dev/null > +++ b/scripts/gnumach-gdb.py > @@ -0,0 +1,63 @@ > + > +import logging > + > +import gdb > + > + > +log = logging.getLogger("gnumach-gdb") > +on64bit = gdb.selected_inferior().architecture().name() == "i386:x86-64" > +on64bituser = gdb.lookup_symbol("syscall64")[0] is not None > + > + > +class TraceCmd(gdb.Command): > + """gnumach-trace [FILE] > + > +Save the content of the kernel trace buffer to a file (default > ./trace.bin). > +""" > + > + def __init__(self): > + super().__init__("gnumach-trace", gdb.COMMAND_USER) > + > + def invoke(self, arg_string, from_tty): > + try: > + self._invoke(arg_string, from_tty) > + except Exception: > + log.exception(arg_string) > + > + def _invoke(self, arg_string, from_tty): > + args = gdb.string_to_argv(arg_string) > + if args in ['-h', '--help']: > + print(TraceCmd.__doc__) > + return > + outfile = "trace.bin" > + if len(args) >= 1: > + outfile = args[0] > + > + tbuf = gdb.lookup_static_symbol("msg_trace_buf") > + if tbuf is None: > + log.error("trace buffer not available.") > + return > + tbuf_base = gdb.lookup_static_symbol("msg_trace_buf").value() > + tbuf_top = gdb.lookup_static_symbol("msg_trace_ptr").value() > + tbuf_overrun = > gdb.lookup_static_symbol("msg_trace_overrun").value() > + tbuf_len = tbuf_top - tbuf_base > + > + print(f"Writing {tbuf_len} bytes to {outfile} (overrun > {tbuf_overrun})") > + proc = gdb.selected_inferior() > + with open(outfile, 'wb') as tf: > + # trace header (see mach/trace.h) > + tf.write(b"GMTR") > + flags = int(tbuf_overrun & 0xF) > + if on64bit: > + flags |= 0x10 > + if on64bituser: > + flags |= 0x20 > + tf.write(flags.to_bytes(4, byteorder='little')) > + tf.write(int(tbuf_len).to_bytes(4, byteorder='little')) > + tf.write(int(0).to_bytes(4, byteorder='little')) > + # trace payload > + tf.write(proc.read_memory(tbuf_base, tbuf_len)) > + > + > +logging.basicConfig(level=logging.INFO) > +TraceCmd() > diff --git a/tests/test-trace.c b/tests/test-trace.c > new file mode 100644 > index 00000000..d6bc8051 > --- /dev/null > +++ b/tests/test-trace.c > @@ -0,0 +1,58 @@ > +/* > + * Copyright (C) 2024 Free Software Foundation > + * > + * This program is free software ; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation ; either version 2 of the License, or > + * (at your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, > + * but WITHOUT ANY WARRANTY ; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > + * GNU General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with the program ; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#include <syscalls.h> > +#include <testlib.h> > + > +#include <mach/trace.h> > +#include <gnumach.user.h> > +#include <mach.user.h> > + > +void test_trace(void) > +{ > + int err; > + err = ktrace_start(host_priv(), 40 * 1024 * 1024, TRACE_TYPE_ALL); > + ASSERT_RET(err, "ktrace_start"); > + > + err = mach_msg(NULL, MACH_MSG_OPTION_NONE, 0, 0, > + MACH_PORT_NULL, 0, MACH_PORT_NULL); > + ASSERT_RET(err, "syscall path with thread_syscall_return()"); > + > + err = ktrace_stop(host_priv()); > + ASSERT_RET(err, "ktrace_stop"); > + > + vm_offset_t buf, last_ptr; > + mach_msg_type_number_t bufsize; > + int flags; > + err = ktrace_collect(host_priv(), &buf, &bufsize, &last_ptr, &flags); > + ASSERT_RET(err, "ktrace_collect"); > + printf("size %lu last %lu flags %x\n", bufsize, last_ptr, flags); > + ASSERT(bufsize >= last_ptr, "ERROR overrun?"); > + > + err = vm_deallocate(mach_task_self(), buf, bufsize); > + ASSERT_RET(err, "vm_deallocate"); > + > + err = ktrace_free(host_priv()); > + ASSERT_RET(err, "ktrace_free"); > +} > + > +int main(int argc, char *argv[], int envc, char *envp[]) > +{ > + test_trace(); > + return 0; > +} > diff --git a/tests/user-qemu.mk b/tests/user-qemu.mk > index fd5ae1ab..c4df73f9 100644 > --- a/tests/user-qemu.mk > +++ b/tests/user-qemu.mk > @@ -204,7 +204,8 @@ USER_TESTS := \ > tests/test-syscalls \ > tests/test-machmsg \ > tests/test-task \ > - tests/test-threads > + tests/test-threads \ > + tests/test-trace > > USER_TESTS_CLEAN = $(subst tests/,clean-,$(USER_TESTS)) > > diff --git a/x86_64/debug_trace.S b/x86_64/debug_trace.S > index 7bed5ccc..97e48063 100644 > --- a/x86_64/debug_trace.S > +++ b/x86_64/debug_trace.S > @@ -30,19 +30,18 @@ > > .text > ENTRY(_debug_trace) > + ud2 // to test, do we need to keep it in asm? > pushf > cli > pushq %rax > pushq %rbx > - .byte 0x36 /* SS: bug in gas? */ > movl %ss:EXT(debug_trace_pos),%eax > movq S_ARG0,%rbx > - movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,16) > - movl S_ARG1,%ebx > - movl %ebx,%ss:EXT(debug_trace_buf)+8(,%eax,16) > + movq %rbx,%ss:EXT(debug_trace_buf)(,%eax,8) > + movq S_ARG1,%rbx > + movq %rbx,%ss:EXT(debug_trace_buf)+8(,%eax,8) > incl %eax > andl $DEBUG_TRACE_LEN-1,%eax > - .byte 0x36 /* SS: bug in gas? */ > movl %eax,%ss:EXT(debug_trace_pos) > popq %rbx > popq %rax > diff --git a/x86_64/interrupt.S b/x86_64/interrupt.S > index 6fb77727..72a2fd0c 100644 > --- a/x86_64/interrupt.S > +++ b/x86_64/interrupt.S > @@ -115,6 +115,25 @@ ENTRY(interrupt) > shll $2,%eax /* irq * 4 */ > movl EXT(iunit)(%rax),%edi /* get device unit number as 1st > arg */ > > +#if KERNEL_TRACE > + testb $0xff,EXT(interrupt_trace_enable) > + jz 0f > + movq S_IRQ,S_ARG4 > + pushq %rax > + // save args for the real handler > + pushq S_ARG0 > + pushq S_ARG1 > + pushq S_ARG2 > + pushq S_ARG3 > + callq EXT(interrupt_trace) > + popq S_ARG3 > + popq S_ARG2 > + popq S_ARG1 > + popq S_ARG0 > + popq %rax > +0: > +#endif /* KERNEL_TRACE */ > + > shll $1,%eax /* irq * 8 */ > call *EXT(ivect)(%rax) /* call interrupt handler */ > > diff --git a/x86_64/locore.S b/x86_64/locore.S > index 8f39a677..9e9e5833 100644 > --- a/x86_64/locore.S > +++ b/x86_64/locore.S > @@ -747,6 +747,15 @@ ENTRY(thread_bootstrap_return) > */ > > ENTRY(thread_syscall_return) > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 1f > + /* first arg is already set in rdi, save it */ > + pushq %rdi > + callq syscall_trace_return > + popq %rdi > +1: > +#endif > movq S_ARG0,%rax /* get return value */ > movq %rsp,%rcx /* get kernel stack */ > or $(KERNEL_STACK_SIZE-1),%rcx > @@ -1275,16 +1284,41 @@ syscall_native: > jnz 0b /* loop for all arguments */ > > mach_call_call: > - > -#ifdef DEBUG > +#if KERNEL_TRACE > testb $0xff,EXT(syscall_trace) > jz 0f > - movq %rax,%rdi > - call EXT(syscall_trace_print) > - /* will return with syscallofs still (or again) in eax */ > + /* > + * We need to save register arguments, for the actual tracing > + * and to restore them before calling the real syscall. > + */ > + pushq %r9 > + pushq %r8 > + pushq %rcx > + pushq %rdx > + pushq %rsi > + pushq %rdi > + pushq %rax > + movq %rsp,%rdi > + callq EXT(syscall_trace_enter) > + popq %rax > + popq %rdi > + popq %rsi > + popq %rdx > + popq %rcx > + popq %r8 > + popq %r9 > + movq %rax,%r12 /* save sysnum for tracing return value */ > 0: > -#endif /* DEBUG */ > +#endif /* KERNEL_TRACE */ > call *EXT(mach_trap_table)+8(%rax) /* call procedure */ > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 1f > + movq %rax,%rdi > + callq syscall_trace_return > + /* will return with return value still (or again) in eax */ > +1: > +#endif /* KERNEL_TRACE */ > movq %rsp,%rcx /* get kernel stack */ > or $(KERNEL_STACK_SIZE-1),%rcx > movq -7-IKS_SIZE(%rcx),%rsp /* switch back to PCB stack */ > @@ -1460,7 +1494,45 @@ _syscall64_args_stack: > jnz 0b /* loop for all remaining > arguments */ > > _syscall64_call: > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 0f > + /* > + * We need to save register arguments, for the actual tracing > + * and to restore them before calling the real syscall. > + * We store all arguments and the syscall number in an array > + * on the stack; fortunately args > 6 are already there, so > + * push the rest and restore them before actually calling the > + * handler. > + */ > + pushq %r9 > + pushq %r8 > + pushq %rcx > + pushq %rdx > + pushq %rsi > + pushq %rdi > + pushq %rax > + movq %rsp,%rdi > + callq EXT(syscall_trace_enter) > + popq %rax > + popq %rdi > + popq %rsi > + popq %rdx > + popq %rcx > + popq %r8 > + popq %r9 > + movq %rax,%r12 /* save sysnum for tracing return value */ > +0: > +#endif /* KERNEL_TRACE */ > call *EXT(mach_trap_table)+8(%rax) /* call procedure */ > +#if KERNEL_TRACE > + testb $0xff,EXT(syscall_trace) > + jz 1f > + movq %rax,%rdi > + callq syscall_trace_return > + /* will return with return value still (or again) in eax */ > +1: > +#endif /* KERNEL_TRACE */ > > _syscall64_check_for_ast: > /* Check for ast. */ > -- > 2.39.2 > > >