Den ons 16 jan. 2019 kl 00:50 skrev Daniel Borkmann <[email protected]>: > > On 01/15/2019 09:35 AM, Björn Töpel wrote: > > This commit adds eBPF JIT for RV64G. > > > > Codewise, it needs some refactoring. Currently there's a bit too much > > copy-and-paste going on, and I know some places where I could optimize > > the code generation a bit (mostly BPF_K type of instructions, dealing > > with immediates). > > Nice work! :) > > > From a features perspective, two things are missing: > > > > * tail calls > > * "far-branches", i.e. conditional branches that reach beyond 13b. > > > > The test_bpf.ko passes all tests. > > Did you also check test_verifier under jit with/without jit hardening > enabled? That one contains lots of runtime tests as well. Probably makes > sense to check under CONFIG_BPF_JIT_ALWAYS_ON to see what fails the JIT; > the test_verifier also contains various tail call tests targeted at JITs, > for example. >
Good point! I will do that. The only selftests/bpf program that I ran (and passed) was "test_progs". I'll make sure that the complete bpf selftests suite passes as well! > Nit: please definitely also add a MAINTAINERS entry with at least yourself > under BPF JIT section, and update Documentation/sysctl/net.txt with riscv64. > Ah! Yes, I'll fix that. > > Signed-off-by: Björn Töpel <[email protected]> > > --- > > arch/riscv/net/bpf_jit_comp.c | 1608 +++++++++++++++++++++++++++++++++ > > 1 file changed, 1608 insertions(+) > > > > diff --git a/arch/riscv/net/bpf_jit_comp.c b/arch/riscv/net/bpf_jit_comp.c > > index 7e359d3249ee..562d56eb8d23 100644 > > --- a/arch/riscv/net/bpf_jit_comp.c > > +++ b/arch/riscv/net/bpf_jit_comp.c > > @@ -1,4 +1,1612 @@ > > +// SPDX-License-Identifier: GPL-2.0 > > +/* > > + * BPF JIT compiler for RV64G > > + * > > + * Copyright(c) 2019 Björn Töpel <[email protected]> > > + * > > + */ > > + > > +#include <linux/bpf.h> > > +#include <linux/filter.h> > > +#include <asm/cacheflush.h> > > + > > +#define TMP_REG_0 (MAX_BPF_JIT_REG + 0) > > +#define TMP_REG_1 (MAX_BPF_JIT_REG + 1) > > Not used? > Correct! I'll get rid of them. 
> > +#define TAIL_CALL_REG (MAX_BPF_JIT_REG + 2) > > + > > +enum rv_register { > > + RV_REG_ZERO = 0, /* The constant value 0 */ > > + RV_REG_RA = 1, /* Return address */ > > + RV_REG_SP = 2, /* Stack pointer */ > > + RV_REG_GP = 3, /* Global pointer */ > > + RV_REG_TP = 4, /* Thread pointer */ > > + RV_REG_T0 = 5, /* Temporaries */ > > + RV_REG_T1 = 6, > > + RV_REG_T2 = 7, > > + RV_REG_FP = 8, > > + RV_REG_S1 = 9, /* Saved registers */ > > + RV_REG_A0 = 10, /* Function argument/return values */ > > + RV_REG_A1 = 11, /* Function arguments */ > > + RV_REG_A2 = 12, > > + RV_REG_A3 = 13, > > + RV_REG_A4 = 14, > > + RV_REG_A5 = 15, > > + RV_REG_A6 = 16, > > + RV_REG_A7 = 17, > > + RV_REG_S2 = 18, /* Saved registers */ > > + RV_REG_S3 = 19, > > + RV_REG_S4 = 20, > > + RV_REG_S5 = 21, > > + RV_REG_S6 = 22, > > + RV_REG_S7 = 23, > > + RV_REG_S8 = 24, > > + RV_REG_S9 = 25, > > + RV_REG_S10 = 26, > > + RV_REG_S11 = 27, > > + RV_REG_T3 = 28, /* Temporaries */ > > + RV_REG_T4 = 29, > > + RV_REG_T5 = 30, > > + RV_REG_T6 = 31, > > +}; > > + > > +struct rv_jit_context { > > + struct bpf_prog *prog; > > + u32 *insns; /* RV insns */ > > + int ninsns; > > + int epilogue_offset; > > + int *offset; /* BPF to RV */ > > + unsigned long seen_reg_bits; > > + int stack_size; > > +}; > > + > > +struct rv_jit_data { > > + struct bpf_binary_header *header; > > + u8 *image; > > + struct rv_jit_context ctx; > > +}; > > + > > +static u8 bpf_to_rv_reg(int bpf_reg, struct rv_jit_context *ctx) > > +{ > > This one can also be simplified by having a simple mapping as in > other JITs and then mark __set_bit(<reg>) in the small bpf_to_rv_reg() > helper. > Yeah, I agree. Much better. I'll take that route. 
> > + switch (bpf_reg) { > > + /* Return value */ > > + case BPF_REG_0: > > + __set_bit(RV_REG_A5, &ctx->seen_reg_bits); > > + return RV_REG_A5; > > + /* Function arguments */ > > + case BPF_REG_1: > > + __set_bit(RV_REG_A0, &ctx->seen_reg_bits); > > + return RV_REG_A0; > > + case BPF_REG_2: > > + __set_bit(RV_REG_A1, &ctx->seen_reg_bits); > > + return RV_REG_A1; > > + case BPF_REG_3: > > + __set_bit(RV_REG_A2, &ctx->seen_reg_bits); > > + return RV_REG_A2; > > + case BPF_REG_4: > > + __set_bit(RV_REG_A3, &ctx->seen_reg_bits); > > + return RV_REG_A3; > > + case BPF_REG_5: > > + __set_bit(RV_REG_A4, &ctx->seen_reg_bits); > > + return RV_REG_A4; > > + /* Callee saved registers */ > > + case BPF_REG_6: > > + __set_bit(RV_REG_S1, &ctx->seen_reg_bits); > > + return RV_REG_S1; > > + case BPF_REG_7: > > + __set_bit(RV_REG_S2, &ctx->seen_reg_bits); > > + return RV_REG_S2; > > + case BPF_REG_8: > > + __set_bit(RV_REG_S3, &ctx->seen_reg_bits); > > + return RV_REG_S3; > > + case BPF_REG_9: > > + __set_bit(RV_REG_S4, &ctx->seen_reg_bits); > > + return RV_REG_S4; > > + /* Stack read-only frame pointer to access stack */ > > + case BPF_REG_FP: > > + __set_bit(RV_REG_S5, &ctx->seen_reg_bits); > > + return RV_REG_S5; > > + /* Temporary register */ > > + case BPF_REG_AX: > > + __set_bit(RV_REG_T0, &ctx->seen_reg_bits); > > + return RV_REG_T0; > > + /* Tail call counter */ > > + case TAIL_CALL_REG: > > + __set_bit(RV_REG_S6, &ctx->seen_reg_bits); > > + return RV_REG_S6; > > + default: > > + return 0; > > + } > > +}; > [...] > > + /* tail call */ > > + case BPF_JMP | BPF_TAIL_CALL: > > + rd = bpf_to_rv_reg(TAIL_CALL_REG, ctx); > > + pr_err("bpf-jit: tail call not supported yet!\n"); > > + return -1; > > There are two options here, either fixed size prologue where you can > then jump over it in tail call case, or dynamic one which would make > it slower due to reg restore but shrinks image for non-tail calls. 
> So, it would be the latter then, which is pretty much like a more expensive (due to the tail call depth checks) function call. For the fixed prologue: how does, say x86, deal with BPF stack usage in the tail call case? If the caller doesn't use the bpf stack, but the callee does. From a quick glance in the code, the x86 prologue still uses aux->stack_depth. If the callee has a different stack usage than the caller, and then the callee does a function call, wouldn't this mess up the frame? (Yeah, obviously missing something! :-)) > > + /* function return */ > > + case BPF_JMP | BPF_EXIT: > > + if (i == ctx->prog->len - 1) > > + break; > > + > > + rvoff = epilogue_offset(ctx); > > + if (!is_21b_int(rvoff)) { > > + pr_err("bpf-jit: %d offset=%d not supported yet!\n", > > + __LINE__, rvoff); > > + return -1; > > + } > > + > > + emit(rv_jal(RV_REG_ZERO, rvoff >> 1), ctx); > > + break; > > + > > + /* dst = imm64 */ > > + case BPF_LD | BPF_IMM | BPF_DW: > > + { > > + struct bpf_insn insn1 = insn[1]; > > + u64 imm64; > > + > [...] 
> > + > > +static void build_prologue(struct rv_jit_context *ctx) > > +{ > > + int stack_adjust = 0, store_offset, bpf_stack_adjust; > > + > > + if (seen_reg(RV_REG_RA, ctx)) > > + stack_adjust += 8; > > + stack_adjust += 8; /* RV_REG_FP */ > > + if (seen_reg(RV_REG_S1, ctx)) > > + stack_adjust += 8; > > + if (seen_reg(RV_REG_S2, ctx)) > > + stack_adjust += 8; > > + if (seen_reg(RV_REG_S3, ctx)) > > + stack_adjust += 8; > > + if (seen_reg(RV_REG_S4, ctx)) > > + stack_adjust += 8; > > + if (seen_reg(RV_REG_S5, ctx)) > > + stack_adjust += 8; > > + if (seen_reg(RV_REG_S6, ctx)) > > + stack_adjust += 8; > > + > > + stack_adjust = round_up(stack_adjust, 16); > > + bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); > > + stack_adjust += bpf_stack_adjust; > > + > > + store_offset = stack_adjust - 8; > > + > > + emit(rv_addi(RV_REG_SP, RV_REG_SP, -stack_adjust), ctx); > > + > > + if (seen_reg(RV_REG_RA, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_RA), ctx); > > + store_offset -= 8; > > + } > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_FP), ctx); > > + store_offset -= 8; > > + if (seen_reg(RV_REG_S1, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S1), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S2, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S2), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S3, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S3), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S4, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S4), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S5, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S5), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S6, ctx)) { > > + emit(rv_sd(RV_REG_SP, store_offset, RV_REG_S6), ctx); > > + store_offset -= 8; > > + } > > + > > + emit(rv_addi(RV_REG_FP, RV_REG_SP, stack_adjust), ctx); > > + > > + if (bpf_stack_adjust) { > > 
+ if (!seen_reg(RV_REG_S5, ctx)) > > + pr_warn("bpf-jit: not seen BPF_REG_FP, stack is %d\n", > > + bpf_stack_adjust); > > + emit(rv_addi(RV_REG_S5, RV_REG_SP, bpf_stack_adjust), ctx); > > + } > > + > > + ctx->stack_size = stack_adjust; > > +} > > + > > +static void build_epilogue(struct rv_jit_context *ctx) > > +{ > > + int stack_adjust = ctx->stack_size, store_offset = stack_adjust - 8; > > + > > + if (seen_reg(RV_REG_RA, ctx)) { > > + emit(rv_ld(RV_REG_RA, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + emit(rv_ld(RV_REG_FP, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + if (seen_reg(RV_REG_S1, ctx)) { > > + emit(rv_ld(RV_REG_S1, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S2, ctx)) { > > + emit(rv_ld(RV_REG_S2, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S3, ctx)) { > > + emit(rv_ld(RV_REG_S3, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S4, ctx)) { > > + emit(rv_ld(RV_REG_S4, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S5, ctx)) { > > + emit(rv_ld(RV_REG_S5, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + if (seen_reg(RV_REG_S6, ctx)) { > > + emit(rv_ld(RV_REG_S6, store_offset, RV_REG_SP), ctx); > > + store_offset -= 8; > > + } > > + > > + emit(rv_addi(RV_REG_SP, RV_REG_SP, stack_adjust), ctx); > > + /* Set return value. 
*/ > > + emit(rv_addi(RV_REG_A0, RV_REG_A5, 0), ctx); > > + emit(rv_jalr(RV_REG_ZERO, RV_REG_RA, 0), ctx); > > +} > > + > > +static int build_body(struct rv_jit_context *ctx, bool extra_pass) > > +{ > > + const struct bpf_prog *prog = ctx->prog; > > + int i; > > + > > + for (i = 0; i < prog->len; i++) { > > + const struct bpf_insn *insn = &prog->insnsi[i]; > > + int ret; > > + > > + ret = emit_insn(insn, ctx, extra_pass); > > + if (ret > 0) { > > + i++; > > + if (ctx->insns == NULL) > > + ctx->offset[i] = ctx->ninsns; > > + continue; > > + } > > + if (ctx->insns == NULL) > > + ctx->offset[i] = ctx->ninsns; > > + if (ret) > > + return ret; > > + } > > + return 0; > > +} > > + > > +static void bpf_fill_ill_insns(void *area, unsigned int size) > > +{ > > + memset(area, 0, size); > > Needs update as well? > No, a bit pattern of all zeros is an illegal instruction, but a comment would be good! > > +} > > + > > +static void bpf_flush_icache(void *start, void *end) > > +{ > > + flush_icache_range((unsigned long)start, (unsigned long)end); > > +} > > + Thanks a lot for the comments, Daniel! I'll get back with a v2. Björn
