When JITing an eBPF program on x86_64, also JIT a list_func that calls the bpf_func in a loop. Since this is a direct call, it should perform better than indirectly calling bpf_func in a loop.

Since fetching the percpu 'redirect_info' variable is ugly and magic, pass a pointer to it into the list_func as a parameter instead of computing it inside the loop. This is safe because BPF execution is not preemptible, so we can't be migrated to another CPU (and thus to a different copy of the variable) while we're using it.
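In C terms, the JITed list_func is just the interpreter's list loop with the indirect call replaced by a direct one (illustrative sketch only; 'bpf_func' stands for the program's JITed image, and struct bpf_work is laid out as in the patch below):

	static void list_func(struct list_head *list, const struct bpf_insn *insn,
			      const struct redirect_info *percpu_ri)
	{
		struct bpf_work *work;

		list_for_each_entry(work, list, list) {
			/* direct call; the old path made this an indirect call */
			work->ret = bpf_func(work->ctx, insn);
			/* copy out this CPU's redirect state for this packet */
			work->ri = *percpu_ri;
		}
	}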
Signed-off-by: Edward Cree <ec...@solarflare.com>
---
 arch/x86/net/bpf_jit_comp.c | 166 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/filter.h      |  19 +++--
 kernel/bpf/core.c           |  18 +++--
 net/core/dev.c              |   5 +-
 4 files changed, 197 insertions(+), 11 deletions(-)

diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 2580cd2e98b1..3e06dd79adda 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -1060,6 +1060,171 @@ struct x64_jit_data {
 	struct jit_context ctx;
 };
 
+static void try_do_jit_list(struct bpf_prog *bpf_prog,
+			    unsigned int (*bpf_func)(const void *ctx,
+						     const struct bpf_insn *insn))
+{
+	/* list_func takes three arguments:
+	 *	struct list_head *list			[RDI]
+	 *	const struct bpf_insn *insn		[RSI]
+	 *	const struct redirect_info *percpu_ri	[RDX]
+	 *
+	 * Layout of struct bpf_work on x86_64:
+	 *	struct list_head {
+	 *		struct list_head *next;		// 0x0
+	 *		struct list_head *prev;		// 0x8
+	 *	} list;					// 0x0
+	 *	void *ctx;				// 0x10
+	 *	struct redirect_info {
+	 *		u32 ifindex;			// 0x18
+	 *		u32 flags;			// 0x1c
+	 *		struct bpf_map *map;		// 0x20
+	 *		struct bpf_map *map_to_flush;	// 0x28
+	 *		unsigned long map_owner;	// 0x30
+	 *	} ri;					// 0x18
+	 *	unsigned long ret;			// 0x38
+	 * (total size 0x40 bytes)
+	 *
+	 * Desired function body:
+	 *	struct redirect_info *ri = percpu_ri;	[R12]
+	 *	struct bpf_work *work;			[RBP]
+	 *
+	 *	list_for_each_entry(work, list, list) {
+	 *		work->ret = (*bpf_func)(work->ctx, insn);
+	 *		work->ri = *ri;
+	 *	}
+	 *
+	 * Assembly to emit:
+	 *	; save CSRs
+	 *	push %rbx
+	 *	push %rbp
+	 *	push %r12
+	 *	; stash pointer to redirect_info
+	 *	mov %rdx,%r12		; ri = percpu_ri
+	 *	; start list
+	 *	mov %rdi,%rbp		; head = list
+	 * next:			; while (true) {
+	 *	mov (%rbp),%rbx		; rbx = head->next
+	 *	cmp %rbx,%rdi		; if (rbx == list)
+	 *	je out			;	break
+	 *	mov %rbx,%rbp		; head = rbx
+	 *	push %rdi		; save list
+	 *	push %rsi		; save insn (is still arg2)
+	 *	; struct bpf_work *work = head (container_of, but list is first member)
+	 *	mov 0x10(%rbp),%rdi	; arg1 = work->ctx
+	 *	callq bpf_func		; rax = (*bpf_func)(work->ctx, insn)
+	 *	mov %rax,0x38(%rbp)	; work->ret = rax
+	 *	; work->ri = *ri
+	 *	mov (%r12),%rdx
+	 *	mov %rdx,0x18(%rbp)
+	 *	mov 0x8(%r12),%rdx
+	 *	mov %rdx,0x20(%rbp)
+	 *	mov 0x10(%r12),%rdx
+	 *	mov %rdx,0x28(%rbp)
+	 *	mov 0x18(%r12),%rdx
+	 *	mov %rdx,0x30(%rbp)
+	 *	pop %rsi		; restore insn
+	 *	pop %rdi		; restore list
+	 *	jmp next		; }
+	 * out:
+	 *	; restore CSRs and return
+	 *	pop %r12
+	 *	pop %rbp
+	 *	pop %rbx
+	 *	retq
+	 */
+	u8 *image, *prog, *from_to_out, *next;
+	struct bpf_binary_header *header;
+	int off, cnt = 0;
+	s64 jmp_offset;
+
+	/* Prog should be 81 bytes; let's round up to 128 */
+	header = bpf_jit_binary_alloc(128, &image, 1, jit_fill_hole);
+	if (!header)
+		return;
+	prog = image;
+
+	/* push %rbx */
+	EMIT1(0x53);
+	/* push %rbp */
+	EMIT1(0x55);
+	/* push %r12 */
+	EMIT2(0x41, 0x54);
+	/* mov %rdx,%r12 */
+	EMIT3(0x49, 0x89, 0xd4);
+	/* mov %rdi,%rbp */
+	EMIT3(0x48, 0x89, 0xfd);
+	next = prog;
+	/* mov 0x0(%rbp),%rbx */
+	EMIT4(0x48, 0x8b, 0x5d, 0x00);
+	/* cmp %rbx,%rdi */
+	EMIT3(0x48, 0x39, 0xdf);
+	/* je out */
+	EMIT2(X86_JE, 0);
+	from_to_out = prog; /* record position to patch this jump later */
+	/* mov %rbx,%rbp */
+	EMIT3(0x48, 0x89, 0xdd);
+	/* push %rdi */
+	EMIT1(0x57);
+	/* push %rsi */
+	EMIT1(0x56);
+	/* mov 0x10(%rbp),%rdi */
+	EMIT4(0x48, 0x8b, 0x7d, 0x10);
+	/* e8 callq bpf_func */
+	jmp_offset = (u8 *)bpf_func - (prog + 5);
+	if (!is_simm32(jmp_offset)) {
+		pr_err("call out of range to BPF func %p from list image %p\n",
+		       bpf_func, image);
+		goto fail;
+	}
+	EMIT1_off32(0xE8, jmp_offset);
+	/* mov %rax,0x38(%rbp) */
+	EMIT4(0x48, 0x89, 0x45, 0x38);
+	/* mov (%r12),%rdx */
+	EMIT4(0x49, 0x8b, 0x14, 0x24);
+	/* mov %rdx,0x18(%rbp) */
+	EMIT4(0x48, 0x89, 0x55, 0x18);
+	for (off = 0x8; off < 0x20; off += 0x8) {
+		/* mov off(%r12),%rdx */
+		EMIT4(0x49, 0x8b, 0x54, 0x24);
+		EMIT1(off);
+		/* mov %rdx,0x18+off(%rbp) */
+		EMIT4(0x48, 0x89, 0x55, 0x18 + off);
+	}
+	/* pop %rsi */
+	EMIT1(0x5e);
+	/* pop %rdi */
+	EMIT1(0x5f);
+	/* jmp next */
+	jmp_offset = next - (prog + 2);
+	if (WARN_ON(!is_imm8(jmp_offset))) /* can't happen */
+		goto fail;
+	EMIT2(0xeb, jmp_offset);
+	/* out: */
+	jmp_offset = prog - from_to_out;
+	if (WARN_ON(!is_imm8(jmp_offset))) /* can't happen */
+		goto fail;
+	from_to_out[-1] = jmp_offset;
+	/* pop %r12 */
+	EMIT2(0x41, 0x5c);
+	/* pop %rbp */
+	EMIT1(0x5d);
+	/* pop %rbx */
+	EMIT1(0x5b);
+	/* retq */
+	EMIT1(0xc3);
+	/* If we were wrong about how much space we needed, scream and shout */
+	WARN_ON(cnt != 81);
+	if (bpf_jit_enable > 1)
+		bpf_jit_dump(0, cnt, 0, image);
+	bpf_jit_binary_lock_ro(header);
+	bpf_prog->list_func = (void *)image;
+	bpf_prog->jited_list = 1;
+	return;
+fail:
+	bpf_jit_binary_free(header);
+}
+
 struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 {
 	struct bpf_binary_header *header = NULL;
@@ -1176,6 +1341,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog)
 		prog->bpf_func = (void *)image;
 		prog->jited = 1;
 		prog->jited_len = proglen;
+		try_do_jit_list(prog, prog->bpf_func);
 	} else {
 		prog = orig_prog;
 	}
diff --git a/include/linux/filter.h b/include/linux/filter.h
index 7d813034e286..ad1e75bf0991 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -517,7 +517,8 @@ struct bpf_prog {
 						    const struct bpf_insn *insn);
 	/* Takes a list of struct bpf_work */
 	void			(*list_func)(struct list_head *list,
-					     const struct bpf_insn *insn);
+					     const struct bpf_insn *insn,
+					     const struct redirect_info *percpu_ri);
 	/* Instructions for interpreter */
 	union {
 		struct sock_filter	insns[0];
@@ -532,7 +533,7 @@ struct sk_filter {
 };
 
 #define BPF_PROG_RUN(filter, ctx)  (*(filter)->bpf_func)(ctx, (filter)->insnsi)
-#define BPF_LIST_PROG_RUN(filter, list) (*(filter)->list_func)(list, (filter)->insnsi)
+#define BPF_LIST_PROG_RUN(filter, list, percpu) (*(filter)->list_func)(list, (filter)->insnsi, percpu)
 
 #define BPF_SKB_CB_LEN QDISC_CB_PRIV_LEN
 
@@ -638,10 +639,11 @@ static __always_inline u32 bpf_prog_run_xdp(const struct bpf_prog *prog,
 }
 
 static __always_inline void bpf_list_prog_run_xdp(const struct bpf_prog *prog,
-						  struct list_head *list)
+						  struct list_head *list,
+						  const struct redirect_info *percpu_ri)
 {
 	/* Caller must hold rcu_read_lock(), as per bpf_prog_run_xdp(). */
-	BPF_LIST_PROG_RUN(prog, list);
+	BPF_LIST_PROG_RUN(prog, list, percpu_ri);
 }
 
 static inline u32 bpf_prog_insn_size(const struct bpf_prog *prog)
@@ -756,6 +758,15 @@ bpf_jit_binary_hdr(const struct bpf_prog *fp)
 	return (void *)addr;
 }
 
+static inline struct bpf_binary_header *
+bpf_list_jit_binary_hdr(const struct bpf_prog *fp)
+{
+	unsigned long real_start = (unsigned long)fp->list_func;
+	unsigned long addr = real_start & PAGE_MASK;
+
+	return (void *)addr;
+}
+
 #ifdef CONFIG_ARCH_HAS_SET_MEMORY
 static inline int bpf_prog_check_pages_ro_single(const struct bpf_prog *fp)
 {
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index c35da826cc3b..028be88c4af8 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -621,15 +621,22 @@ void bpf_jit_binary_free(struct bpf_binary_header *hdr)
  */
 void __weak bpf_jit_free(struct bpf_prog *fp)
 {
-	if (fp->jited) {
-		struct bpf_binary_header *hdr = bpf_jit_binary_hdr(fp);
+	struct bpf_binary_header *hdr;
 
+	if (fp->jited) {
+		hdr = bpf_jit_binary_hdr(fp);
 		bpf_jit_binary_unlock_ro(hdr);
 		bpf_jit_binary_free(hdr);
 
 		WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp));
 	}
 
+	if (fp->jited_list) {
+		hdr = bpf_list_jit_binary_hdr(fp);
+		bpf_jit_binary_unlock_ro(hdr);
+		bpf_jit_binary_free(hdr);
+	}
+
 	bpf_prog_unlock_free(fp);
 }
 
@@ -1358,13 +1365,13 @@ static u64 PROG_NAME_ARGS(stack_size)(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5, \
 
 #define LIST_PROG_NAME(stack_size) __bpf_list_prog_run##stack_size
 #define DEFINE_BPF_LIST_PROG_RUN(stack_size) \
-static void LIST_PROG_NAME(stack_size)(struct list_head *list, const struct bpf_insn *insn) \
+static void LIST_PROG_NAME(stack_size)(struct list_head *list, const struct bpf_insn *insn, const struct redirect_info *percpu_ri) \
 { \
 	struct bpf_work *work; \
 \
 	list_for_each_entry(work, list, list) { \
 		work->ret = PROG_NAME(stack_size)(work->ctx, insn); \
-		work->ri = *this_cpu_ptr(&redirect_info); \
+		work->ri = *percpu_ri; \
 	} \
 }
 
@@ -1398,7 +1405,8 @@ EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
 #undef PROG_NAME_LIST
 #define PROG_NAME_LIST(stack_size) LIST_PROG_NAME(stack_size),
 static void (*list_interpreters[])(struct list_head *list,
-				   const struct bpf_insn *insn) = {
+				   const struct bpf_insn *insn,
+				   const struct redirect_info *percpu_ri) = {
 EVAL6(PROG_NAME_LIST, 32, 64, 96, 128, 160, 192)
 EVAL6(PROG_NAME_LIST, 224, 256, 288, 320, 352, 384)
 EVAL4(PROG_NAME_LIST, 416, 448, 480, 512)
diff --git a/net/core/dev.c b/net/core/dev.c
index 746112c22afd..7c1879045ef8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4214,6 +4214,7 @@ static void do_xdp_list_generic(struct bpf_prog *xdp_prog,
 				struct sk_buff_head *list,
 				struct sk_buff_head *pass_list)
 {
+	const struct redirect_info *percpu_ri = this_cpu_ptr(&redirect_info);
 	struct xdp_work (*xwa)[NAPI_POLL_WEIGHT], *xw;
 	struct bpf_work *bw;
 	struct sk_buff *skb;
@@ -4249,11 +4250,11 @@ static void do_xdp_list_generic(struct bpf_prog *xdp_prog,
 
 	if (xdp_prog->list_func && (xdp_prog->jited_list ||
 				    !xdp_prog->jited))
-		bpf_list_prog_run_xdp(xdp_prog, &xdp_list);
+		bpf_list_prog_run_xdp(xdp_prog, &xdp_list, percpu_ri);
 	else
 		list_for_each_entry(bw, &xdp_list, list) {
 			bw->ret = bpf_prog_run_xdp(xdp_prog, bw->ctx);
-			bw->ri = *percpu_ri;
+			bw->ri = *percpu_ri;
 		}
 
 	for (i = 0; i < n; i++) {
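
Note that the emitted offsets (ctx at 0x10, ri at 0x18, ret at 0x38, 0x40 total) hard-code the struct bpf_work layout described in the big comment above. Compile-time guards along these lines at the top of try_do_jit_list() (a sketch, not part of the patch) would catch the layout drifting out from under the JIT:

	/* hypothetical guards, not in this patch */
	BUILD_BUG_ON(offsetof(struct bpf_work, ctx) != 0x10);
	BUILD_BUG_ON(offsetof(struct bpf_work, ri)  != 0x18);
	BUILD_BUG_ON(offsetof(struct bpf_work, ret) != 0x38);
	BUILD_BUG_ON(sizeof(struct bpf_work)        != 0x40);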