On Tue, Apr 2, 2019 at 9:12 AM Arnaldo Carvalho de Melo <a...@kernel.org> wrote: > > From: Arnaldo Carvalho de Melo <a...@redhat.com> > > The previous method, copying to the BPF stack limited us in how many > bytes we could copy from strings, use a PERCPU_ARRAY map like devised by > the sysdig guys[1] to copy more bytes: > > Before: > > # trace --no-inherit -e openat touch `python -c "print "$s" 'a' * 2000"` > touch: cannot touch > 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa': > File name too long > openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 > openat(AT_FDCWD, "/lib64/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 > openat(AT_FDCWD, "/usr/lib/locale/locale-archive", O_RDONLY|O_CLOEXEC) = 3 > openat(AT_FDCWD, > "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", > O_CREAT|O_NOCTTY|O_NONBLOCK|O_WRONLY, S_IRUGO|S_IWUGO) = -1 ENAMETOOLONG > (File name too long) > openat(AT_FDCWD, "/usr/share/locale/locale.alias", O_RDONLY|O_CLOEXEC) = 3 > openat(AT_FDCWD, "/usr/share/locale/en_US.UTF-8/LC_MESSAGES/coreutils.mo", > O_RDONLY) = -1 ENOENT (No such file or directory) > openat(AT_FDCWD, "/usr/share/locale/en_US.utf8/LC_MESSAGES/coreutils.mo", > O_RDONLY) = -1 ENOENT (No such file or directory) > <SNIP some openat calls> > # > > After: > > [root@quaco acme]# trace --no-inherit -e openat touch `python -c "print > "$s" 'a' * 2000"` > <STRIP what is the same as in the 'before' part> > openat(AT_FDCWD, > "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", > O_CREAT|O_NOCTTY|O_NONBLOC) = -1 ENAMETOOLONG (File name too long) > <STRIP what is the same as in the 'before' part> > > If we leave something like 'perf trace -e string' to trace all syscalls > with a string, and then do some 'perf top', to get some annotation for > the augmented_raw_syscalls.o BPF program we get: > > │ → callq *ffffffffc45576d1 > ▒ > │ augmented_args->filename.size = > probe_read_str(&augmented_args->filename.value, > ▒ > 0.05 │ mov %eax,0x40(%r13) > > Looking with pahole, expanding types, asking for hex offsets and sizes, > and use of BTF type information to see what is at that 0x40 offset from > %r13: > > # pahole -F btf -C augmented_args_filename --expand_types --hex > /home/acme/git/perf/tools/perf/examples/bpf/augmented_raw_syscalls.o > struct augmented_args_filename { > struct syscall_enter_args { > long long unsigned int common_tp_fields; > /* 0 0x8 */ > long int syscall_nr; > /* 0x8 0x8 */ > long unsigned int args[6]; > /* 0x10 0x30 */ > } args; /* 0 0x40 */ > /* --- cacheline 1 boundary (64 bytes) --- */ > struct augmented_filename { > unsigned int size; > /* 0x40 0x4 */ > int reserved; > /* 0x44 0x4 */ > char value[4096]; > /* 0x48 0x1000 */ > } filename; /* 0x40 0x1008 */ > > /* size: 4168, cachelines: 66, members: 2 */ > /* last cacheline: 8 bytes */ > }; > # > > Then looking if PATH_MAX leaves some signature in the tests: > > │ if (augmented_args->filename.size < > sizeof(augmented_args->filename.value)) { > ▒ > │ cmp $0xfff,%rdi > > 0xfff == 4095 > sizeof(augmented_args->filename.value) == PATH_MAX == 4096 > > [1] https://sysdig.com/blog/the-art-of-writing-ebpf-programs-a-primer/ > > Cc: Adrian Hunter <adrian.hun...@intel.com> > Cc: Andrii Nakryiko <andr...@fb.com> > Cc: Daniel Borkmann <borkm...@iogearbox.net> > Cc: Gianluca Borello <g.bore...@gmail.com> > Cc: Jesper Dangaard Brouer <bro...@redhat.com> > Cc: Jiri Olsa <jo...@kernel.org> > Cc: Luis Cláudio Gonçalves <lclau...@redhat.com> > cc: Martin Lau <ka...@fb.com> > Cc: Namhyung Kim <namhy...@kernel.org> > Cc: Wang Nan <wangn...@huawei.com> > Cc: Yonghong Song <y...@fb.com> > Link: https://lkml.kernel.org/n/tip-76gce2d2ghzq537ubwhjk...@git.kernel.org > Signed-off-by: Arnaldo Carvalho de Melo <a...@redhat.com> > ---
Looks good to me. I've been using PERCPU_ARRAY as a sort of heap for many things that don't fit on BPF stack. Acked-by: Andrii Nakryiko <andr...@fb.com> > .../examples/bpf/augmented_raw_syscalls.c | 46 +++++++++++-------- > 1 file changed, 28 insertions(+), 18 deletions(-) > > diff --git a/tools/perf/examples/bpf/augmented_raw_syscalls.c > b/tools/perf/examples/bpf/augmented_raw_syscalls.c > index 9f8b31ad7a49..2422894a8194 100644 > --- a/tools/perf/examples/bpf/augmented_raw_syscalls.c > +++ b/tools/perf/examples/bpf/augmented_raw_syscalls.c > @@ -15,6 +15,7 @@ > */ > > #include <unistd.h> > +#include <linux/limits.h> > #include <pid_filter.h> > > /* bpf-output associated map */ > @@ -41,7 +42,7 @@ struct syscall_exit_args { > struct augmented_filename { > unsigned int size; > int reserved; > - char value[256]; > + char value[PATH_MAX]; > }; > > /* syscalls where the first arg is a string */ > @@ -119,23 +120,32 @@ struct augmented_filename { > > pid_filter(pids_filtered); > > +struct augmented_args_filename { > + struct syscall_enter_args args; > + struct augmented_filename filename; > +}; > + > +bpf_map(augmented_filename_map, PERCPU_ARRAY, int, struct > augmented_args_filename, 1); > + > SEC("raw_syscalls:sys_enter") > int sys_enter(struct syscall_enter_args *args) > { > - struct { > - struct syscall_enter_args args; > - struct augmented_filename filename; > - } augmented_args; > - struct syscall *syscall; > - unsigned int len = sizeof(augmented_args); > + struct augmented_args_filename *augmented_args; > + unsigned int len = sizeof(*augmented_args); > const void *filename_arg = NULL; > + struct syscall *syscall; > + int key = 0; > + > + augmented_args = bpf_map_lookup_elem(&augmented_filename_map, &key); > + if (augmented_args == NULL) > + return 1; > > if (pid_filter__has(&pids_filtered, getpid())) > return 0; > > - probe_read(&augmented_args.args, sizeof(augmented_args.args), args); > + probe_read(&augmented_args->args, sizeof(augmented_args->args), args); > > - syscall = bpf_map_lookup_elem(&syscalls, > &augmented_args.args.syscall_nr); > + syscall = bpf_map_lookup_elem(&syscalls, > &augmented_args->args.syscall_nr); > if (syscall == NULL || !syscall->enabled) > return 0; > /* > @@ -191,7 +201,7 @@ int sys_enter(struct syscall_enter_args *args) > * processor architecture, making the kernel part the same no matter > what > * kernel version or processor architecture it runs on. > */ > - switch (augmented_args.args.syscall_nr) { > + switch (augmented_args->args.syscall_nr) { > case SYS_ACCT: > case SYS_ADD_KEY: > case SYS_CHDIR: > @@ -263,20 +273,20 @@ int sys_enter(struct syscall_enter_args *args) > } > > if (filename_arg != NULL) { > - augmented_args.filename.reserved = 0; > - augmented_args.filename.size = > probe_read_str(&augmented_args.filename.value, > - > sizeof(augmented_args.filename.value), > + augmented_args->filename.reserved = 0; > + augmented_args->filename.size = > probe_read_str(&augmented_args->filename.value, > + > sizeof(augmented_args->filename.value), > filename_arg); > - if (augmented_args.filename.size < > sizeof(augmented_args.filename.value)) { > - len -= sizeof(augmented_args.filename.value) - > augmented_args.filename.size; > - len &= sizeof(augmented_args.filename.value) - 1; > + if (augmented_args->filename.size < > sizeof(augmented_args->filename.value)) { > + len -= sizeof(augmented_args->filename.value) - > augmented_args->filename.size; > + len &= sizeof(augmented_args->filename.value) - 1; > } > } else { > - len = sizeof(augmented_args.args); > + len = sizeof(augmented_args->args); > } > > /* If perf_event_output fails, return non-zero so that it gets > recorded unaugmented */ > - return perf_event_output(args, &__augmented_syscalls__, > BPF_F_CURRENT_CPU, &augmented_args, len); > + return perf_event_output(args, &__augmented_syscalls__, > BPF_F_CURRENT_CPU, augmented_args, len); > } > > SEC("raw_syscalls:sys_exit") > -- > 2.20.1 >