use sk_convert_filter() to convert seccomp BPF into extended BPF 05-sim-long_jumps.c of libseccomp was used as micro-benchmark: seccomp_rule_add_exact(ctx,... seccomp_rule_add_exact(ctx,... rc = seccomp_load(ctx); for (i = 0; i < 10000000; i++) syscall(199, 100);
'short filter' has 2 rules 'large filter' has 200 rules 'short filter' performance is slightly better on x86_64,i386,arm32 'large filter' is much faster on x86_64 and i386 and shows no difference on arm32 --x86_64-- short filter old BPF: 2.7 sec 39.12% bench libc-2.15.so [.] syscall 8.10% bench [kernel.kallsyms] [k] sk_run_filter 6.31% bench [kernel.kallsyms] [k] system_call 5.59% bench [kernel.kallsyms] [k] trace_hardirqs_on_caller 4.37% bench [kernel.kallsyms] [k] trace_hardirqs_off_caller 3.70% bench [kernel.kallsyms] [k] __secure_computing 3.67% bench [kernel.kallsyms] [k] lock_is_held 3.03% bench [kernel.kallsyms] [k] seccomp_bpf_load new BPF: 2.58 sec 42.05% bench libc-2.15.so [.] syscall 6.91% bench [kernel.kallsyms] [k] system_call 6.25% bench [kernel.kallsyms] [k] trace_hardirqs_on_caller 6.07% bench [kernel.kallsyms] [k] __secure_computing 5.08% bench [kernel.kallsyms] [k] sk_run_filter_ext --arm32-- short filter old BPF: 4.0 sec 39.92% bench [kernel.kallsyms] [k] vector_swi 16.60% bench [kernel.kallsyms] [k] sk_run_filter 14.66% bench libc-2.17.so [.] syscall 5.42% bench [kernel.kallsyms] [k] seccomp_bpf_load 5.10% bench [kernel.kallsyms] [k] __secure_computing new BPF: 3.7 sec 35.93% bench [kernel.kallsyms] [k] vector_swi 21.89% bench libc-2.17.so [.] syscall 13.45% bench [kernel.kallsyms] [k] sk_run_filter_ext 6.25% bench [kernel.kallsyms] [k] __secure_computing 3.96% bench [kernel.kallsyms] [k] syscall_trace_exit --x86_64-- large filter old BPF: 8.6 seconds 73.38% bench [kernel.kallsyms] [k] sk_run_filter 10.70% bench libc-2.15.so [.] syscall 5.09% bench [kernel.kallsyms] [k] seccomp_bpf_load 1.97% bench [kernel.kallsyms] [k] system_call ext BPF: 5.7 seconds 66.20% bench [kernel.kallsyms] [k] sk_run_filter_ext 16.75% bench libc-2.15.so [.] syscall 3.31% bench [kernel.kallsyms] [k] system_call 2.88% bench [kernel.kallsyms] [k] __secure_computing --i386-- large filter old BPF: 5.4 sec ext BPF: 3.8 sec --arm32-- large filter old BPF: 13.5 sec 73.88% bench [kernel.kallsyms] [k] sk_run_filter 10.29% bench [kernel.kallsyms] [k] vector_swi 6.46% bench libc-2.17.so [.] syscall 2.94% bench [kernel.kallsyms] [k] seccomp_bpf_load 1.19% bench [kernel.kallsyms] [k] __secure_computing 0.87% bench [kernel.kallsyms] [k] sys_getuid new BPF: 13.5 sec 76.08% bench [kernel.kallsyms] [k] sk_run_filter_ext 10.98% bench [kernel.kallsyms] [k] vector_swi 5.87% bench libc-2.17.so [.] syscall 1.77% bench [kernel.kallsyms] [k] __secure_computing 0.93% bench [kernel.kallsyms] [k] sys_getuid BPF filters generated by seccomp are very branchy, so ext BPF performance is better than old BPF. Performance gains will be even higher when extended BPF JIT is committed. Signed-off-by: Alexei Starovoitov <a...@plumgrid.com> Reviewed-by: Kees Cook <keesc...@chromium.org> --- include/linux/seccomp.h | 1 - kernel/seccomp.c | 118 ++++++++++++++++++++++------------------------- net/core/filter.c | 5 -- 3 files changed, 56 insertions(+), 68 deletions(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index 6f19cfd1840e..4054b0994071 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -76,7 +76,6 @@ static inline int seccomp_mode(struct seccomp *s) #ifdef CONFIG_SECCOMP_FILTER extern void put_seccomp_filter(struct task_struct *tsk); extern void get_seccomp_filter(struct task_struct *tsk); -extern u32 seccomp_bpf_load(int off); #else /* CONFIG_SECCOMP_FILTER */ static inline void put_seccomp_filter(struct task_struct *tsk) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index b7a10048a32c..2a18f43acbd6 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -55,60 +55,31 @@ struct seccomp_filter { atomic_t usage; struct seccomp_filter *prev; unsigned short len; /* Instruction count */ - struct sock_filter insns[]; + struct sock_filter_ext insns[]; }; /* Limit any path through the tree to 256KB worth of instructions. */ #define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter)) -/** - * get_u32 - returns a u32 offset into data - * @data: a unsigned 64 bit value - * @index: 0 or 1 to return the first or second 32-bits - * - * This inline exists to hide the length of unsigned long. If a 32-bit - * unsigned long is passed in, it will be extended and the top 32-bits will be - * 0. If it is a 64-bit unsigned long, then whatever data is resident will be - * properly returned. - * +/* * Endianness is explicitly ignored and left for BPF program authors to manage * as per the specific architecture. */ -static inline u32 get_u32(u64 data, int index) +static void populate_seccomp_data(struct seccomp_data *sd) { - return ((u32 *)&data)[index]; -} - -/* Helper for bpf_load below. */ -#define BPF_DATA(_name) offsetof(struct seccomp_data, _name) -/** - * bpf_load: checks and returns a pointer to the requested offset - * @off: offset into struct seccomp_data to load from - * - * Returns the requested 32-bits of data. - * seccomp_check_filter() should assure that @off is 32-bit aligned - * and not out of bounds. Failure to do so is a BUG. - */ -u32 seccomp_bpf_load(int off) -{ - struct pt_regs *regs = task_pt_regs(current); - if (off == BPF_DATA(nr)) - return syscall_get_nr(current, regs); - if (off == BPF_DATA(arch)) - return syscall_get_arch(current, regs); - if (off >= BPF_DATA(args[0]) && off < BPF_DATA(args[6])) { - unsigned long value; - int arg = (off - BPF_DATA(args[0])) / sizeof(u64); - int index = !!(off % sizeof(u64)); - syscall_get_arguments(current, regs, arg, 1, &value); - return get_u32(value, index); - } - if (off == BPF_DATA(instruction_pointer)) - return get_u32(KSTK_EIP(current), 0); - if (off == BPF_DATA(instruction_pointer) + sizeof(u32)) - return get_u32(KSTK_EIP(current), 1); - /* seccomp_check_filter should make this impossible. */ - BUG(); + struct task_struct *task = current; + struct pt_regs *regs = task_pt_regs(task); + + sd->nr = syscall_get_nr(task, regs); + sd->arch = syscall_get_arch(task, regs); + /* unroll syscall_get_args to help gcc on arm */ + syscall_get_arguments(task, regs, 0, 1, (unsigned long *)&sd->args[0]); + syscall_get_arguments(task, regs, 1, 1, (unsigned long *)&sd->args[1]); + syscall_get_arguments(task, regs, 2, 1, (unsigned long *)&sd->args[2]); + syscall_get_arguments(task, regs, 3, 1, (unsigned long *)&sd->args[3]); + syscall_get_arguments(task, regs, 4, 1, (unsigned long *)&sd->args[4]); + syscall_get_arguments(task, regs, 5, 1, (unsigned long *)&sd->args[5]); + sd->instruction_pointer = KSTK_EIP(task); } /** @@ -133,17 +104,17 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) switch (code) { case BPF_S_LD_W_ABS: - ftest->code = BPF_S_ANC_SECCOMP_LD_W; + ftest->code = BPF_LDX | BPF_W | BPF_ABS; /* 32-bit aligned and not out of bounds. */ if (k >= sizeof(struct seccomp_data) || k & 3) return -EINVAL; continue; case BPF_S_LD_W_LEN: - ftest->code = BPF_S_LD_IMM; + ftest->code = BPF_LD | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; case BPF_S_LDX_W_LEN: - ftest->code = BPF_S_LDX_IMM; + ftest->code = BPF_LDX | BPF_IMM; ftest->k = sizeof(struct seccomp_data); continue; /* Explicitly include allowed calls. */ @@ -185,6 +156,7 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) case BPF_S_JMP_JGT_X: case BPF_S_JMP_JSET_K: case BPF_S_JMP_JSET_X: + sk_decode_filter(ftest, ftest); continue; default: return -EINVAL; @@ -202,18 +174,21 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen) static u32 seccomp_run_filters(int syscall) { struct seccomp_filter *f; + struct seccomp_data sd; u32 ret = SECCOMP_RET_ALLOW; /* Ensure unexpected behavior doesn't result in failing open. */ if (WARN_ON(current->seccomp.filter == NULL)) return SECCOMP_RET_KILL; + populate_seccomp_data(&sd); + /* * All filters in the list are evaluated and the lowest BPF return * value always takes priority (ignoring the DATA). */ for (f = current->seccomp.filter; f; f = f->prev) { - u32 cur_ret = sk_run_filter(NULL, f->insns); + u32 cur_ret = sk_run_filter_ext(&sd, f->insns); if ((cur_ret & SECCOMP_RET_ACTION) < (ret & SECCOMP_RET_ACTION)) ret = cur_ret; } @@ -231,6 +206,8 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) struct seccomp_filter *filter; unsigned long fp_size = fprog->len * sizeof(struct sock_filter); unsigned long total_insns = fprog->len; + struct sock_filter *fp; + int new_len; long ret; if (fprog->len == 0 || fprog->len > BPF_MAXINSNS) @@ -252,28 +229,42 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) CAP_SYS_ADMIN) != 0) return -EACCES; - /* Allocate a new seccomp_filter */ - filter = kzalloc(sizeof(struct seccomp_filter) + fp_size, - GFP_KERNEL|__GFP_NOWARN); - if (!filter) + fp = kzalloc(fp_size, GFP_KERNEL|__GFP_NOWARN); + if (!fp) return -ENOMEM; - atomic_set(&filter->usage, 1); - filter->len = fprog->len; /* Copy the instructions from fprog. */ ret = -EFAULT; - if (copy_from_user(filter->insns, fprog->filter, fp_size)) - goto fail; + if (copy_from_user(fp, fprog->filter, fp_size)) + goto free_prog; /* Check and rewrite the fprog via the skb checker */ - ret = sk_chk_filter(filter->insns, filter->len); + ret = sk_chk_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; /* Check and rewrite the fprog for seccomp use */ - ret = seccomp_check_filter(filter->insns, filter->len); + ret = seccomp_check_filter(fp, fprog->len); if (ret) - goto fail; + goto free_prog; + + /* convert 'sock_filter' insns to 'sock_filter_ext' insns */ + ret = sk_convert_filter(fp, fprog->len, NULL, &new_len); + if (ret) + goto free_prog; + + /* Allocate a new seccomp_filter */ + filter = kzalloc(sizeof(struct seccomp_filter) + + sizeof(struct sock_filter_ext) * new_len, + GFP_KERNEL|__GFP_NOWARN); + if (!filter) + goto free_prog; + + ret = sk_convert_filter(fp, fprog->len, filter->insns, &new_len); + if (ret) + goto free_filter; + atomic_set(&filter->usage, 1); + filter->len = new_len; /* * If there is an existing filter, make it the prev and don't drop its @@ -282,8 +273,11 @@ static long seccomp_attach_filter(struct sock_fprog *fprog) filter->prev = current->seccomp.filter; current->seccomp.filter = filter; return 0; -fail: + +free_filter: kfree(filter); +free_prog: + kfree(fp); return ret; } diff --git a/net/core/filter.c b/net/core/filter.c index 860f77874b14..9676dcd3905e 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -384,11 +384,6 @@ load_b: A = 0; continue; } -#ifdef CONFIG_SECCOMP_FILTER - case BPF_S_ANC_SECCOMP_LD_W: - A = seccomp_bpf_load(fentry->k); - continue; -#endif default: WARN_RATELIMIT(1, "Unknown code:%u jt:%u tf:%u k:%u\n", fentry->code, fentry->jt, -- 1.7.9.5 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/