Just-In-Time compiler that maps 64-bit BPF instructions to x86-64 instructions.
Most BPF instructions have a one-to-one mapping. Every BPF register maps
to one x86-64 register:

R0 -> rax
R1 -> rdi
R2 -> rsi
R3 -> rdx
R4 -> rcx
R5 -> r8
R6 -> rbx
R7 -> r13
R8 -> r14
R9 -> r15
FP -> rbp

The BPF calling convention is defined as:

R0    - return value from in-kernel function
R1-R5 - arguments from BPF program to in-kernel function
R6-R9 - callee saved registers that in-kernel function will preserve
R10   - read-only frame pointer to access stack

so the BPF calling convention maps directly to the x86-64 calling
convention, allowing zero-overhead calls between a BPF filter and safe
kernel functions.

Signed-off-by: Alexei Starovoitov <a...@plumgrid.com>
---
 arch/x86/Kconfig              |    1 +
 arch/x86/net/Makefile         |    1 +
 arch/x86/net/bpf64_jit_comp.c |  625 +++++++++++++++++++++++++++++++++++++++++
 arch/x86/net/bpf_jit_comp.c   |   23 +-
 arch/x86/net/bpf_jit_comp.h   |   35 +++
 5 files changed, 665 insertions(+), 20 deletions(-)
 create mode 100644 arch/x86/net/bpf64_jit_comp.c
 create mode 100644 arch/x86/net/bpf_jit_comp.h
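[Editor's note: because R1-R5 land in rdi, rsi, rdx, rcx and r8 -- the
first five integer argument registers of the x86-64 SysV ABI -- and R0
maps to rax, the return register, the BPF_JMP|BPF_CALL case below can
emit a call to a kernel helper as a single 'call rel32' with no register
shuffling.  The sketch below is a user-space model of the byte emitter
and REX/ModRM helpers from the new file; the enum values for R0..R9, the
demo main() and the build line are illustrative assumptions, not patch
code.]

/* Editor's sketch, not part of the patch.  Build: gcc -o enc_demo enc_demo.c */
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>

enum { R0, R1, R2, R3, R4, R5, R6, R7, R8, R9 };	/* demo-only values */

/* same mapping as reg2hex[] below: R1 -> rdi(7), R7 -> r13(5), ... */
static const int reg2hex[] = { 0, 7, 6, 2, 1, 0, 3, 5, 6, 7 };

static bool is_ereg(uint32_t reg)	/* r8..r15 need a REX.B/REX.R bit */
{
	return reg == R5 || (reg >= R7 && reg <= R9);
}

static uint8_t add_2mod(uint8_t byte, uint32_t r1, uint32_t r2)
{
	if (is_ereg(r1))
		byte |= 1;	/* REX.B: extends the ModRM r/m field */
	if (is_ereg(r2))
		byte |= 4;	/* REX.R: extends the ModRM reg field */
	return byte;
}

static uint8_t add_2reg(uint8_t byte, uint32_t a_reg, uint32_t x_reg)
{
	return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3);
}

static uint8_t *emit_code(uint8_t *ptr, uint32_t bytes, unsigned int len)
{
	memcpy(ptr, &bytes, len);	/* little-endian byte order, as on x86 */
	return ptr + len;
}

#define EMIT(bytes, len)	(prog = emit_code(prog, (bytes), (len)))
#define EMIT1(b1)		EMIT(b1, 1)
#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
#define EMIT_mov(A, X)		EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X))

int main(void)
{
	uint8_t image[16], *prog = image;
	int i;

	EMIT1(0x55);			/* push rbp  (start of the JIT prologue) */
	EMIT3(0x48, 0x89, 0xE5);	/* mov rbp,rsp */
	EMIT_mov(R7, R1);		/* BPF "mov R7, R1" -> x86 "mov r13,rdi" */

	for (i = 0; i < prog - image; i++)
		printf("%02x ", image[i]);
	printf("\n");	/* expect: 55 48 89 e5 49 89 fd */
	return 0;
}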
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index fe55897..ff97d4b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -94,6 +94,7 @@ config X86
 	select GENERIC_CLOCKEVENTS_MIN_ADJUST
 	select IRQ_FORCED_THREADING
 	select HAVE_BPF_JIT if X86_64
+	select HAVE_BPF64_JIT if X86_64
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select CLKEVT_I8253
 	select ARCH_HAVE_NMI_SAFE_CMPXCHG
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
index 90568c3..c3bb7d5 100644
--- a/arch/x86/net/Makefile
+++ b/arch/x86/net/Makefile
@@ -2,3 +2,4 @@
 # Arch-specific network modules
 #
 obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
+obj-$(CONFIG_BPF64_JIT) += bpf64_jit_comp.o
diff --git a/arch/x86/net/bpf64_jit_comp.c b/arch/x86/net/bpf64_jit_comp.c
new file mode 100644
index 0000000..5f7c331
--- /dev/null
+++ b/arch/x86/net/bpf64_jit_comp.c
@@ -0,0 +1,625 @@
+/*
+ * Copyright (c) 2011-2013 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf_jit.h>
+#include <linux/moduleloader.h>
+#include "bpf_jit_comp.h"
+
+static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
+{
+	if (len == 1)
+		*ptr = bytes;
+	else if (len == 2)
+		*(u16 *)ptr = bytes;
+	else
+		*(u32 *)ptr = bytes;
+	return ptr + len;
+}
+
+#define EMIT(bytes, len) (prog = emit_code(prog, (bytes), (len)))
+
+#define EMIT1(b1)		EMIT(b1, 1)
+#define EMIT2(b1, b2)		EMIT((b1) + ((b2) << 8), 2)
+#define EMIT3(b1, b2, b3)	EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
+#define EMIT4(b1, b2, b3, b4)	EMIT((b1) + ((b2) << 8) + ((b3) << 16) + \
+				     ((b4) << 24), 4)
+/* imm32 is sign extended by cpu */
+#define EMIT1_off32(b1, off) \
+	do {EMIT1(b1); EMIT(off, 4); } while (0)
+#define EMIT2_off32(b1, b2, off) \
+	do {EMIT2(b1, b2); EMIT(off, 4); } while (0)
+#define EMIT3_off32(b1, b2, b3, off) \
+	do {EMIT3(b1, b2, b3); EMIT(off, 4); } while (0)
+#define EMIT4_off32(b1, b2, b3, b4, off) \
+	do {EMIT4(b1, b2, b3, b4); EMIT(off, 4); } while (0)
+
+/* mov A, X */
+#define EMIT_mov(A, X) \
+	EMIT3(add_2mod(0x48, A, X), 0x89, add_2reg(0xC0, A, X))
+
+#define X86_JAE 0x73
+#define X86_JE  0x74
+#define X86_JNE 0x75
+#define X86_JA  0x77
+#define X86_JGE 0x7D
+#define X86_JG  0x7F
+
+static inline bool is_imm8(__s32 value)
+{
+	return value <= 127 && value >= -128;
+}
+
+static inline bool is_simm32(__s64 value)
+{
+	return value == (__s64)(__s32)value;
+}
+
+static int bpf_size_to_x86_bytes(int bpf_size)
+{
+	if (bpf_size == BPF_W)
+		return 4;
+	else if (bpf_size == BPF_H)
+		return 2;
+	else if (bpf_size == BPF_B)
+		return 1;
+	else if (bpf_size == BPF_DW)
+		return 4; /* imm32 */
+	else
+		return 0;
+}
+
+#define AUX_REG 32
+
+/* avoid x86-64 R12 which if used as base address in memory access
+ * always needs an extra byte for index */
+static const int reg2hex[] = {
+	[R0] = 0, /* rax */
+	[R1] = 7, /* rdi */
+	[R2] = 6, /* rsi */
+	[R3] = 2, /* rdx */
+	[R4] = 1, /* rcx */
+	[R5] = 0, /* r8 */
+	[R6] = 3, /* rbx callee saved */
+	[R7] = 5, /* r13 callee saved */
+	[R8] = 6, /* r14 callee saved */
+	[R9] = 7, /* r15 callee saved */
+	[__fp__] = 5, /* rbp readonly */
+	[AUX_REG] = 1, /* r9 temp register */
+};
+
+/* is_ereg() == true if r8 <= reg <= r15,
+ * rax,rcx,...,rbp don't need extra byte of encoding */
+static inline bool is_ereg(u32 reg)
+{
+	if (reg == R5 || (reg >= R7 && reg <= R9) || reg == AUX_REG)
+		return true;
+	else
+		return false;
+}
+
+static inline u8 add_1mod(u8 byte, u32 reg)
+{
+	if (is_ereg(reg))
+		byte |= 1;
+	return byte;
+}
+static inline u8 add_2mod(u8 byte, u32 r1, u32 r2)
+{
+	if (is_ereg(r1))
+		byte |= 1;
+	if (is_ereg(r2))
+		byte |= 4;
+	return byte;
+}
+
+static inline u8 add_1reg(u8 byte, u32 a_reg)
+{
+	return byte + reg2hex[a_reg];
+}
+static inline u8 add_2reg(u8 byte, u32 a_reg, u32 x_reg)
+{
+	return byte + reg2hex[a_reg] + (reg2hex[x_reg] << 3);
+}
+
+static u8 *select_bpf_func(struct bpf_program *prog, int id)
+{
+	if (id <= 0 || id >= prog->strtab_size)
+		return NULL;
+	return prog->cb->jit_select_func(prog->strtab, id);
+}
+
+static int do_jit(struct bpf_program *bpf_prog, int *addrs, u8 *image,
+		  int oldproglen)
+{
+	struct bpf_insn *insn = bpf_prog->insns;
+	int insn_cnt = bpf_prog->insn_cnt;
+	u8 temp[64];
+	int i;
+	int proglen = 0;
+	u8 *prog = temp;
+	int stacksize = 512;
+
+	EMIT1(0x55); /* push rbp */
+	EMIT3(0x48, 0x89, 0xE5); /* mov rbp,rsp */
+
+	/* sub rsp, stacksize */
+	EMIT3_off32(0x48, 0x81, 0xEC, stacksize);
+	/* mov qword ptr [rbp-X],rbx */
+	EMIT3_off32(0x48, 0x89, 0x9D, -stacksize);
+	/* mov qword ptr [rbp-X],r13 */
+	EMIT3_off32(0x4C, 0x89, 0xAD, -stacksize + 8);
+	/* mov qword ptr [rbp-X],r14 */
+	EMIT3_off32(0x4C, 0x89, 0xB5, -stacksize + 16);
+	/* mov qword ptr [rbp-X],r15 */
+	EMIT3_off32(0x4C, 0x89, 0xBD, -stacksize + 24);
+
+	for (i = 0; i < insn_cnt; i++, insn++) {
+		const __s32 K = insn->imm;
+		__u32 a_reg = insn->a_reg;
+		__u32 x_reg = insn->x_reg;
+		u8 b1 = 0, b2 = 0, b3 = 0;
+		u8 jmp_cond;
+		__s64 jmp_offset;
+		int ilen;
+		u8 *func;
+
+		switch (insn->code) {
+		/* ALU */
+		case BPF_ALU | BPF_ADD | BPF_X:
+		case BPF_ALU | BPF_SUB | BPF_X:
+		case BPF_ALU | BPF_AND | BPF_X:
+		case BPF_ALU | BPF_OR | BPF_X:
+		case BPF_ALU | BPF_XOR | BPF_X:
+			b1 = 0x48;
+			b3 = 0xC0;
+			switch (BPF_OP(insn->code)) {
+			case BPF_ADD: b2 = 0x01; break;
+			case BPF_SUB: b2 = 0x29; break;
+			case BPF_AND: b2 = 0x21; break;
+			case BPF_OR: b2 = 0x09; break;
+			case BPF_XOR: b2 = 0x31; break;
+			}
+			EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+			      add_2reg(b3, a_reg, x_reg));
+			break;
+
+		/* mov A, X */
+		case BPF_ALU | BPF_MOV | BPF_X:
+			EMIT_mov(a_reg, x_reg);
+			break;
+
+		/* neg A */
+		case BPF_ALU | BPF_NEG | BPF_X:
+			EMIT3(add_1mod(0x48, a_reg), 0xF7,
+			      add_1reg(0xD8, a_reg));
+			break;
+
+		case BPF_ALU | BPF_ADD | BPF_K:
+		case BPF_ALU | BPF_SUB | BPF_K:
+		case BPF_ALU | BPF_AND | BPF_K:
+		case BPF_ALU | BPF_OR | BPF_K:
+			b1 = add_1mod(0x48, a_reg);
+
+			switch (BPF_OP(insn->code)) {
+			case BPF_ADD: b3 = 0xC0; break;
+			case BPF_SUB: b3 = 0xE8; break;
+			case BPF_AND: b3 = 0xE0; break;
+			case BPF_OR: b3 = 0xC8; break;
+			}
+
+			if (is_imm8(K))
+				EMIT4(b1, 0x83, add_1reg(b3, a_reg), K);
+			else
+				EMIT3_off32(b1, 0x81, add_1reg(b3, a_reg), K);
+			break;
+
+		case BPF_ALU | BPF_MOV | BPF_K:
+			/* 'mov rax, imm32' sign extends imm32.
+			 * possible optimization: if imm32 is positive,
+			 * use 'mov eax, imm32' (which zero-extends imm32)
+			 * to save 2 bytes */
+			b1 = add_1mod(0x48, a_reg);
+			b2 = 0xC7;
+			b3 = 0xC0;
+			EMIT3_off32(b1, b2, add_1reg(b3, a_reg), K);
+			break;
+
+		/* A %= X
+		 * A /= X */
+		case BPF_ALU | BPF_MOD | BPF_X:
+		case BPF_ALU | BPF_DIV | BPF_X:
+			EMIT1(0x50); /* push rax */
+			EMIT1(0x52); /* push rdx */
+
+			/* mov r9, X */
+			EMIT_mov(AUX_REG, x_reg);
+
+			/* mov rax, A */
+			EMIT_mov(R0, a_reg);
+
+			/* xor rdx, rdx */
+			EMIT3(0x48, 0x31, 0xd2);
+
+			/* if X==0, skip divide, make A=0 */
+
+			/* cmp r9, 0 */
+			EMIT4(0x49, 0x83, 0xF9, 0x00);
+
+			/* je .+3 */
+			EMIT2(X86_JE, 3);
+
+			/* div r9 */
+			EMIT3(0x49, 0xF7, 0xF1);
+
+			if (BPF_OP(insn->code) == BPF_MOD) {
+				/* mov r9, rdx */
+				EMIT3(0x49, 0x89, 0xD1);
+			} else {
+				/* mov r9, rax */
+				EMIT3(0x49, 0x89, 0xC1);
+			}
+
+			EMIT1(0x5A); /* pop rdx */
+			EMIT1(0x58); /* pop rax */
+
+			/* mov A, r9 */
+			EMIT_mov(a_reg, AUX_REG);
+			break;
+
+		/* shifts */
+		case BPF_ALU | BPF_LSH | BPF_K:
+		case BPF_ALU | BPF_RSH | BPF_K:
+		case BPF_ALU | BPF_ARSH | BPF_K:
+			b1 = add_1mod(0x48, a_reg);
+			switch (BPF_OP(insn->code)) {
+			case BPF_LSH: b3 = 0xE0; break;
+			case BPF_RSH: b3 = 0xE8; break;
+			case BPF_ARSH: b3 = 0xF8; break;
+			}
+			EMIT4(b1, 0xC1, add_1reg(b3, a_reg), K);
+			break;
+
+		case BPF_ALU | BPF_BSWAP32 | BPF_X:
+			/* emit 'bswap eax' to swap lower 4-bytes */
+			if (is_ereg(a_reg))
+				EMIT2(0x41, 0x0F);
+			else
+				EMIT1(0x0F);
+			EMIT1(add_1reg(0xC8, a_reg));
+			break;
+
+		case BPF_ALU | BPF_BSWAP64 | BPF_X:
+			/* emit 'bswap rax' to swap 8-bytes */
+			EMIT3(add_1mod(0x48, a_reg), 0x0F,
+			      add_1reg(0xC8, a_reg));
+			break;
+
+		/* ST: *(u8*)(a_reg + off) = imm */
+		case BPF_ST | BPF_REL | BPF_B:
+			if (is_ereg(a_reg))
+				EMIT2(0x41, 0xC6);
+			else
+				EMIT1(0xC6);
+			goto st;
+		case BPF_ST | BPF_REL | BPF_H:
+			if (is_ereg(a_reg))
+				EMIT3(0x66, 0x41, 0xC7);
+			else
+				EMIT2(0x66, 0xC7);
+			goto st;
+		case BPF_ST | BPF_REL | BPF_W:
+			if (is_ereg(a_reg))
+				EMIT2(0x41, 0xC7);
+			else
+				EMIT1(0xC7);
+			goto st;
+		case BPF_ST | BPF_REL | BPF_DW:
+			EMIT2(add_1mod(0x48, a_reg), 0xC7);
+
+st:		if (is_imm8(insn->off))
+			EMIT2(add_1reg(0x40, a_reg), insn->off);
+		else
+			EMIT1_off32(add_1reg(0x80, a_reg), insn->off);
+
+		EMIT(K, bpf_size_to_x86_bytes(BPF_SIZE(insn->code)));
+		break;
+
+		/* STX: *(u8*)(a_reg + off) = x_reg */
+		case BPF_STX | BPF_REL | BPF_B:
+			/* emit 'mov byte ptr [rax + off], al' */
+			if (is_ereg(a_reg) || is_ereg(x_reg) ||
+			    /* have to add extra byte for x86 SIL, DIL regs */
+			    x_reg == R1 || x_reg == R2)
+				EMIT2(add_2mod(0x40, a_reg, x_reg), 0x88);
+			else
+				EMIT1(0x88);
+			goto stx;
+		case BPF_STX | BPF_REL | BPF_H:
+			if (is_ereg(a_reg) || is_ereg(x_reg))
+				EMIT3(0x66, add_2mod(0x40, a_reg, x_reg), 0x89);
+			else
+				EMIT2(0x66, 0x89);
+			goto stx;
+		case BPF_STX | BPF_REL | BPF_W:
+			if (is_ereg(a_reg) || is_ereg(x_reg))
+				EMIT2(add_2mod(0x40, a_reg, x_reg), 0x89);
+			else
+				EMIT1(0x89);
+			goto stx;
+		case BPF_STX | BPF_REL | BPF_DW:
+			EMIT2(add_2mod(0x48, a_reg, x_reg), 0x89);
+stx:		if (is_imm8(insn->off))
+			EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+		else
+			EMIT1_off32(add_2reg(0x80, a_reg, x_reg),
+				    insn->off);
+		break;
+
+		/* LDX: a_reg = *(u8*)(x_reg + off) */
+		case BPF_LDX | BPF_REL | BPF_B:
+			/* emit 'movzx rax, byte ptr [rax + off]' */
+			EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB6);
+			goto ldx;
+		case BPF_LDX | BPF_REL | BPF_H:
+			/* emit 'movzx rax, word ptr [rax + off]' */
+			EMIT3(add_2mod(0x48, x_reg, a_reg), 0x0F, 0xB7);
+			goto ldx;
+		case BPF_LDX | BPF_REL | BPF_W:
+			/* emit 'mov eax, dword ptr [rax+0x14]' */
+			if (is_ereg(a_reg) || is_ereg(x_reg))
+				EMIT2(add_2mod(0x40, x_reg, a_reg), 0x8B);
+			else
+				EMIT1(0x8B);
+			goto ldx;
+		case BPF_LDX | BPF_REL | BPF_DW:
+			/* emit 'mov rax, qword ptr [rax+0x14]' */
+			EMIT2(add_2mod(0x48, x_reg, a_reg), 0x8B);
+ldx:		/* if insn->off == 0 we can save one extra byte, but
+		 * special case of x86 R13 which always needs an offset
+		 * is not worth the pain */
+		if (is_imm8(insn->off))
+			EMIT2(add_2reg(0x40, x_reg, a_reg), insn->off);
+		else
+			EMIT1_off32(add_2reg(0x80, x_reg, a_reg),
+				    insn->off);
+		break;
+
+		/* STX XADD: lock *(u8*)(a_reg + off) += x_reg */
+		case BPF_STX | BPF_XADD | BPF_B:
+			/* emit 'lock add byte ptr [rax + off], al' */
+			if (is_ereg(a_reg) || is_ereg(x_reg) ||
+			    /* have to add extra byte for x86 SIL, DIL regs */
+			    x_reg == R1 || x_reg == R2)
+				EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x00);
+			else
+				EMIT2(0xF0, 0x00);
+			goto xadd;
+		case BPF_STX | BPF_XADD | BPF_H:
+			if (is_ereg(a_reg) || is_ereg(x_reg))
+				EMIT4(0x66, 0xF0, add_2mod(0x40, a_reg, x_reg),
+				      0x01);
+			else
+				EMIT3(0x66, 0xF0, 0x01);
+			goto xadd;
+		case BPF_STX | BPF_XADD | BPF_W:
+			if (is_ereg(a_reg) || is_ereg(x_reg))
+				EMIT3(0xF0, add_2mod(0x40, a_reg, x_reg), 0x01);
+			else
+				EMIT2(0xF0, 0x01);
+			goto xadd;
+		case BPF_STX | BPF_XADD | BPF_DW:
+			EMIT3(0xF0, add_2mod(0x48, a_reg, x_reg), 0x01);
+xadd:		if (is_imm8(insn->off))
+			EMIT2(add_2reg(0x40, a_reg, x_reg), insn->off);
+		else
+			EMIT1_off32(add_2reg(0x80, a_reg, x_reg),
+				    insn->off);
+		break;
+
+		/* call */
+		case BPF_JMP | BPF_CALL:
+			func = select_bpf_func(bpf_prog, K);
+			jmp_offset = func - (image + addrs[i]);
+			if (!func || !is_simm32(jmp_offset)) {
+				pr_err("unsupported bpf func %d addr %p image %p\n",
+				       K, func, image);
+				return -EINVAL;
+			}
+			EMIT1_off32(0xE8, jmp_offset);
+			break;
+
+		/* cond jump */
+		case BPF_JMP | BPF_JEQ | BPF_X:
+		case BPF_JMP | BPF_JNE | BPF_X:
+		case BPF_JMP | BPF_JGT | BPF_X:
+		case BPF_JMP | BPF_JGE | BPF_X:
+		case BPF_JMP | BPF_JSGT | BPF_X:
+		case BPF_JMP | BPF_JSGE | BPF_X:
+			/* emit 'cmp a_reg, x_reg' insn */
+			b1 = 0x48;
+			b2 = 0x39;
+			b3 = 0xC0;
+			EMIT3(add_2mod(b1, a_reg, x_reg), b2,
+			      add_2reg(b3, a_reg, x_reg));
+			goto emit_jump;
+		case BPF_JMP | BPF_JEQ | BPF_K:
+		case BPF_JMP | BPF_JNE | BPF_K:
+		case BPF_JMP | BPF_JGT | BPF_K:
+		case BPF_JMP | BPF_JGE | BPF_K:
+		case BPF_JMP | BPF_JSGT | BPF_K:
+		case BPF_JMP | BPF_JSGE | BPF_K:
+			/* emit 'cmp a_reg, imm8/32' */
+			EMIT1(add_1mod(0x48, a_reg));
+
+			if (is_imm8(K))
+				EMIT3(0x83, add_1reg(0xF8, a_reg), K);
+			else
+				EMIT2_off32(0x81, add_1reg(0xF8, a_reg), K);
+
+emit_jump:	/* convert BPF opcode to x86 */
+			switch (BPF_OP(insn->code)) {
+			case BPF_JEQ:
+				jmp_cond = X86_JE;
+				break;
+			case BPF_JNE:
+				jmp_cond = X86_JNE;
+				break;
+			case BPF_JGT:
+				/* GT is unsigned '>', JA in x86 */
+				jmp_cond = X86_JA;
+				break;
+			case BPF_JGE:
+				/* GE is unsigned '>=', JAE in x86 */
+				jmp_cond = X86_JAE;
+				break;
+			case BPF_JSGT:
+				/* signed '>', GT in x86 */
+				jmp_cond = X86_JG;
+				break;
+			case BPF_JSGE:
+				/* signed '>=', GE in x86 */
+				jmp_cond = X86_JGE;
+				break;
+			default: /* to silence gcc warning */
+				return -EFAULT;
+			}
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(jmp_cond, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT2_off32(0x0F, jmp_cond + 0x10, jmp_offset);
+			} else {
+				pr_err("cond_jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+
+		case BPF_JMP | BPF_JA | BPF_X:
+			jmp_offset = addrs[i + insn->off] - addrs[i];
+			if (is_imm8(jmp_offset)) {
+				EMIT2(0xEB, jmp_offset);
+			} else if (is_simm32(jmp_offset)) {
+				EMIT1_off32(0xE9, jmp_offset);
+			} else {
+				pr_err("jmp gen bug %llx\n", jmp_offset);
+				return -EFAULT;
+			}
+
+			break;
+
+		case BPF_RET | BPF_K:
+			/* mov rbx, qword ptr [rbp-X] */
+			EMIT3_off32(0x48, 0x8B, 0x9D, -stacksize);
+			/* mov r13, qword ptr [rbp-X] */
+			EMIT3_off32(0x4C, 0x8B, 0xAD, -stacksize + 8);
+			/* mov r14, qword ptr [rbp-X] */
+			EMIT3_off32(0x4C, 0x8B, 0xB5, -stacksize + 16);
+			/* mov r15, qword ptr [rbp-X] */
+			EMIT3_off32(0x4C, 0x8B, 0xBD, -stacksize + 24);
+
+			EMIT1(0xC9); /* leave */
+			EMIT1(0xC3); /* ret */
+			break;
+
+		default:
+			/*pr_debug_bpf_insn(insn, NULL);*/
+			pr_err("bpf_jit: unknown opcode %02x\n", insn->code);
+			return -EINVAL;
+		}
+
+		ilen = prog - temp;
+		if (image) {
+			if (proglen + ilen > oldproglen)
+				return -2;
+			memcpy(image + proglen, temp, ilen);
+		}
+		proglen += ilen;
+		addrs[i] = proglen;
+		prog = temp;
+	}
+	return proglen;
+}
+
+void bpf_compile(struct bpf_program *prog)
+{
+	struct bpf_binary_header *header = NULL;
+	int proglen, oldproglen = 0;
+	int *addrs;
+	u8 *image = NULL;
+	int pass;
+	int i;
+
+	if (!prog || !prog->cb || !prog->cb->jit_select_func)
+		return;
+
+	addrs = kmalloc(prog->insn_cnt * sizeof(*addrs), GFP_KERNEL);
+	if (!addrs)
+		return;
+
+	for (proglen = 0, i = 0; i < prog->insn_cnt; i++) {
+		proglen += 64;
+		addrs[i] = proglen;
+	}
+	for (pass = 0; pass < 10; pass++) {
+		proglen = do_jit(prog, addrs, image, oldproglen);
+		if (proglen <= 0) {
+			image = NULL;
+			goto out;
+		}
+		if (image) {
+			if (proglen != oldproglen)
+				pr_err("bpf_jit: proglen=%d != oldproglen=%d\n",
+				       proglen, oldproglen);
+			break;
+		}
+		if (proglen == oldproglen) {
+			header = bpf_alloc_binary(proglen, &image);
+			if (!header)
+				goto out;
+		}
+		oldproglen = proglen;
+	}
+
+	if (image) {
+		bpf_flush_icache(header, image + proglen);
+		set_memory_ro((unsigned long)header, header->pages);
+	}
+out:
+	kfree(addrs);
+	prog->jit_image = (void (*)(struct bpf_context *ctx))image;
+	return;
+}
+
+static void bpf_jit_free_deferred(struct work_struct *work)
+{
+	struct bpf_program *prog = container_of(work, struct bpf_program, work);
+	unsigned long addr = (unsigned long)prog->jit_image & PAGE_MASK;
+	struct bpf_binary_header *header = (void *)addr;
+
+	set_memory_rw(addr, header->pages);
+	module_free(NULL, header);
+	free_bpf_program(prog);
+}
+
+void __bpf_free(struct bpf_program *prog)
+{
+	if (prog->jit_image) {
+		INIT_WORK(&prog->work, bpf_jit_free_deferred);
+		schedule_work(&prog->work);
+	} else {
+		free_bpf_program(prog);
+	}
+}
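[Editor's note: bpf_compile() above sizes the image with a convergence
loop -- addrs[] starts out assuming 64 bytes per instruction, and each
do_jit() pass re-encodes jumps against the previous pass's offsets until
the total length stops changing, at which point the binary image is
allocated and filled in a final pass.  The sketch below is a simplified
user-space model of that loop; insn_len(), the jump-target rule and the
instruction count are made-up stand-ins, not the patch's encoding logic.]

/* Editor's sketch, not part of the patch.  Build: gcc -o pass_demo pass_demo.c */
#include <stdio.h>

#define INSN_CNT 8

/* pretend every instruction is a jump that needs 2 bytes when its target
 * is within imm8 range and 5 bytes otherwise, like the EMIT2()/EMIT2_off32()
 * choice in do_jit() */
static int insn_len(int i, const int *addrs)
{
	int off = addrs[(i + 3) % INSN_CNT] - addrs[i];

	return (off >= -128 && off <= 127) ? 2 : 5;
}

static int do_pass(int *addrs)
{
	int proglen = 0;
	int i;

	for (i = 0; i < INSN_CNT; i++) {
		proglen += insn_len(i, addrs);
		addrs[i] = proglen;	/* offset just past instruction i, as in do_jit() */
	}
	return proglen;
}

int main(void)
{
	int addrs[INSN_CNT];
	int proglen, oldproglen = 0;
	int pass, i;

	/* pass 0 input: pessimistic 64 bytes per instruction, as in bpf_compile() */
	for (proglen = 0, i = 0; i < INSN_CNT; i++) {
		proglen += 64;
		addrs[i] = proglen;
	}

	for (pass = 0; pass < 10; pass++) {
		proglen = do_pass(addrs);
		printf("pass %d: proglen=%d\n", pass, proglen);
		if (proglen == oldproglen)
			break;	/* offsets converged; safe to allocate and emit for real */
		oldproglen = proglen;
	}
	return 0;
}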
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
index 4ed75dd..f9ece1e 100644
--- a/arch/x86/net/bpf_jit_comp.c
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -13,6 +13,7 @@
 #include <linux/filter.h>
 #include <linux/if_vlan.h>
 #include <linux/random.h>
+#include "bpf_jit_comp.h"
 
 /*
  * Conventions :
@@ -112,16 +113,6 @@ do {								\
 #define SEEN_XREG    2 /* ebx is used */
 #define SEEN_MEM     4 /* use mem[] for temporary storage */
 
-static inline void bpf_flush_icache(void *start, void *end)
-{
-	mm_segment_t old_fs = get_fs();
-
-	set_fs(KERNEL_DS);
-	smp_wmb();
-	flush_icache_range((unsigned long)start, (unsigned long)end);
-	set_fs(old_fs);
-}
-
 #define CHOOSE_LOAD_FUNC(K, func) \
 	((int)K < 0 ? ((int)K >= SKF_LL_OFF ? func##_negative_offset : func) : func##_positive_offset)
 
@@ -145,16 +136,8 @@ static int pkt_type_offset(void)
 	return -1;
 }
 
-struct bpf_binary_header {
-	unsigned int pages;
-	/* Note : for security reasons, bpf code will follow a randomly
-	 * sized amount of int3 instructions
-	 */
-	u8 image[];
-};
-
-static struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
-						  u8 **image_ptr)
+struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+					   u8 **image_ptr)
 {
 	unsigned int sz, hole;
 	struct bpf_binary_header *header;
diff --git a/arch/x86/net/bpf_jit_comp.h b/arch/x86/net/bpf_jit_comp.h
new file mode 100644
index 0000000..74ff45d
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.h
@@ -0,0 +1,35 @@
+/* bpf_jit_comp.h : BPF filter alloc/free routines
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+#ifndef __BPF_JIT_COMP_H
+#define __BPF_JIT_COMP_H
+
+#include <linux/uaccess.h>
+#include <asm/cacheflush.h>
+
+struct bpf_binary_header {
+	unsigned int pages;
+	/* Note : for security reasons, bpf code will follow a randomly
+	 * sized amount of int3 instructions
+	 */
+	u8 image[];
+};
+
+static inline void bpf_flush_icache(void *start, void *end)
+{
+	mm_segment_t old_fs = get_fs();
+
+	set_fs(KERNEL_DS);
+	smp_wmb();
+	flush_icache_range((unsigned long)start, (unsigned long)end);
+	set_fs(old_fs);
+}
+
+struct bpf_binary_header *bpf_alloc_binary(unsigned int proglen,
+					   u8 **image_ptr);
+
+#endif
-- 
1.7.9.5
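[Editor's note: the "randomly sized amount of int3 instructions" comment
in struct bpf_binary_header describes placing the JITed code after a
random-sized run of int3 padding inside its allocation, so a stray jump
into the slack traps instead of running leftover bytes; bpf_compile()
then makes the whole header read-only with set_memory_ro().  The sketch
below only illustrates that layout idea in user space; the buffer size,
RNG and fill logic are assumptions, not the internals of
bpf_alloc_binary().]

/* Editor's sketch, not part of the patch.  Build: gcc -o hole_demo hole_demo.c */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#define IMAGE_ALLOC 4096

int main(void)
{
	/* push rbp; mov rbp,rsp; leave; ret -- a stand-in for a JITed program */
	unsigned char prog[] = { 0x55, 0x48, 0x89, 0xE5, 0xC9, 0xC3 };
	unsigned char *buf = malloc(IMAGE_ALLOC);
	size_t start;

	if (!buf)
		return 1;

	memset(buf, 0xCC, IMAGE_ALLOC);			/* fill the slack with int3 */
	srand((unsigned)time(NULL));
	start = (size_t)rand() % (IMAGE_ALLOC - sizeof(prog));
	memcpy(buf + start, prog, sizeof(prog));	/* image at a random offset */

	printf("%zu byte image placed at offset %zu of %d\n",
	       sizeof(prog), start, IMAGE_ALLOC);
	free(buf);
	return 0;
}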