On Tue, Nov 21, 2023 at 04:48:29PM +0100, Mark Kettenis wrote: > > Date: Tue, 21 Nov 2023 16:08:13 +0100 > > From: Tobias Heider <tobias.hei...@stusta.de> > > > > On Tue, Nov 21, 2023 at 12:23:08PM +0100, Mark Kettenis wrote: > > > > Date: Tue, 21 Nov 2023 12:04:42 +0100 > > > > From: Tobias Heider <tobias.hei...@stusta.de> > > > > > > > > On Tue, Nov 21, 2023 at 11:56:18AM +0100, Mark Kettenis wrote: > > > > > > Date: Tue, 21 Nov 2023 00:16:40 +0100 > > > > > > From: Tobias Heider <tobias.hei...@stusta.de> > > > > > > > > > > > > Diff below fixes make regress for libffi with arm64 BTI enabled. > > > > > > The tricky part were two jump tables in ffi.c and sysV.S. > > > > > > > > > > > > ok? > > > > > > > > > > I think you missed the "computed goto" in ffi_closure_SYSV. > > > > > > > > > > Maybe we shouldn't add a "bti j" for the unused slots? > > > > > > > > Functionally yes, that should also work and makes more sense. > > > > But then the brks would be a bit redundant. I wasn't sure if there > > > > are tests or something that expects those breaks to be reachable. > > > > > > The brks still need to be there for those poor CPUs that don't > > > implement BTI. > > > > > > > Here is an updated diff where all reserved and unused entries > > don't get bti j. make regress still passes. > > Looks like you missed my comment about ffi_closure_SYSV.
urgh yes, missed that. fixed in diff below > Is the closures stuff not tested as part of make regress? it seems like a lot of tests are skipped in our port. I'll see if I can force it to run a few more. # of expected passes 5822 # of unsupported tests 209 Index: Makefile =================================================================== RCS file: /cvs/ports/devel/libffi/Makefile,v retrieving revision 1.48 diff -u -p -r1.48 Makefile --- Makefile 21 Sep 2023 09:49:57 -0000 1.48 +++ Makefile 21 Nov 2023 16:07:43 -0000 @@ -1,6 +1,7 @@ COMMENT= Foreign Function Interface V= 3.4.4 +REVISION= 0 DISTNAME= libffi-$V SHARED_LIBS += ffi 2.0 # 9.2 CATEGORIES= devel Index: patches/patch-src_aarch64_ffi_c =================================================================== RCS file: patches/patch-src_aarch64_ffi_c diff -N patches/patch-src_aarch64_ffi_c --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_aarch64_ffi_c 21 Nov 2023 16:07:43 -0000 @@ -0,0 +1,76 @@ +Index: src/aarch64/ffi.c +--- src/aarch64/ffi.c.orig ++++ src/aarch64/ffi.c +@@ -390,47 +390,59 @@ extend_hfa_type (void *dest, void *src, int h) + "adr %0, 0f\n" + " add %0, %0, %1\n" + " br %0\n" +-"0: ldp s16, s17, [%3]\n" /* S4 */ ++"0: bti j\n" /* S4 */ ++" ldp s16, s17, [%3]\n" + " ldp s18, s19, [%3, #8]\n" + " b 4f\n" +-" ldp s16, s17, [%3]\n" /* S3 */ ++" bti j\n" /* S3 */ ++" ldp s16, s17, [%3]\n" + " ldr s18, [%3, #8]\n" + " b 3f\n" +-" ldp s16, s17, [%3]\n" /* S2 */ ++" bti j\n" /* S2 */ ++" ldp s16, s17, [%3]\n" + " b 2f\n" + " nop\n" +-" ldr s16, [%3]\n" /* S1 */ ++" bti j\n" /* S1 */ ++" ldr s16, [%3]\n" + " b 1f\n" + " nop\n" +-" ldp d16, d17, [%3]\n" /* D4 */ ++" bti j\n" /* D4 */ ++" ldp d16, d17, [%3]\n" + " ldp d18, d19, [%3, #16]\n" + " b 4f\n" +-" ldp d16, d17, [%3]\n" /* D3 */ ++" bti j\n" /* D3 */ ++" ldp d16, d17, [%3]\n" + " ldr d18, [%3, #16]\n" + " b 3f\n" +-" ldp d16, d17, [%3]\n" /* D2 */ ++" bti j\n" /* D2 */ ++" ldp d16, d17, [%3]\n" + " b 2f\n" + " nop\n" +-" ldr d16, [%3]\n" /* D1 */ ++" bti j\n" /* D1 */ ++" ldr d16, [%3]\n" + " b 1f\n" + " nop\n" +-" ldp q16, q17, [%3]\n" /* Q4 */ ++" bti j\n" /* Q4 */ ++" ldp q16, q17, [%3]\n" + " ldp q18, q19, [%3, #32]\n" + " b 4f\n" +-" ldp q16, q17, [%3]\n" /* Q3 */ ++" bti j\n" /* Q3 */ ++" ldp q16, q17, [%3]\n" + " ldr q18, [%3, #32]\n" + " b 3f\n" +-" ldp q16, q17, [%3]\n" /* Q2 */ ++" bti j\n" /* Q2 */ ++" ldp q16, q17, [%3]\n" + " b 2f\n" + " nop\n" +-" ldr q16, [%3]\n" /* Q1 */ ++" bti j\n" /* Q1 */ ++" ldr q16, [%3]\n" + " b 1f\n" + "4: str q19, [%2, #48]\n" + "3: str q18, [%2, #32]\n" + "2: str q17, [%2, #16]\n" + "1: str q16, [%2]" + : "=&r"(x0) +- : "r"(f * 12), "r"(dest), "r"(src) ++ : "r"(f * 16), "r"(dest), "r"(src) + : "memory", "v16", "v17", "v18", "v19"); + } + #endif Index: patches/patch-src_aarch64_sysv_S =================================================================== RCS file: patches/patch-src_aarch64_sysv_S diff -N patches/patch-src_aarch64_sysv_S --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ patches/patch-src_aarch64_sysv_S 21 Nov 2023 16:07:43 -0000 @@ -0,0 +1,369 @@ +Index: src/aarch64/sysv.S +--- src/aarch64/sysv.S.orig ++++ src/aarch64/sysv.S +@@ -78,6 +78,7 @@ SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + + cfi_startproc + CNAME(ffi_call_SYSV): ++ bti c + /* Sign the lr with x1 since that is where it will be stored */ + SIGN_LR_WITH_REG(x1) + +@@ -138,78 +139,142 @@ CNAME(ffi_call_SYSV): + /* Save the return value as directed. */ + adr x5, 0f + and w4, w4, #AARCH64_RET_MASK +- add x5, x5, x4, lsl #3 ++ add x5, x5, x4, lsl #4 + br x5 + +- /* Note that each table entry is 2 insns, and thus 8 bytes. ++ /* Note that each table entry is 4 insns, and thus 16 bytes. + For integer data, note that we're storing into ffi_arg + and therefore we want to extend to 64 bits; these types + have two consecutive entries allocated for them. */ + .align 4 +-0: b 99f /* VOID */ ++0: bti j /* VOID */ ++ b 99f ++ nop + nop +-1: str x0, [x3] /* INT64 */ ++1: bti j /* INT64 */ ++ str x0, [x3] + b 99f +-2: stp x0, x1, [x3] /* INT128 */ ++ nop ++2: bti j /* INT128 */ ++ stp x0, x1, [x3] + b 99f ++ nop + 3: brk #1000 /* UNUSED */ + b 99f ++ nop ++ nop + 4: brk #1000 /* UNUSED */ + b 99f ++ nop ++ nop + 5: brk #1000 /* UNUSED */ + b 99f ++ nop ++ nop + 6: brk #1000 /* UNUSED */ + b 99f ++ nop ++ nop + 7: brk #1000 /* UNUSED */ + b 99f +-8: st4 { v0.s, v1.s, v2.s, v3.s }[0], [x3] /* S4 */ ++ nop ++ nop ++8: bti j /* S4 */ ++ st4 { v0.s, v1.s, v2.s, v3.s }[0], [x3] + b 99f +-9: st3 { v0.s, v1.s, v2.s }[0], [x3] /* S3 */ ++ nop ++9: bti j /* S3 */ ++ st3 { v0.s, v1.s, v2.s }[0], [x3] + b 99f +-10: stp s0, s1, [x3] /* S2 */ ++ nop ++10: bti j /* S2 */ ++ stp s0, s1, [x3] + b 99f +-11: str s0, [x3] /* S1 */ ++ nop ++11: bti j ++ str s0, [x3] /* S1 */ + b 99f +-12: st4 { v0.d, v1.d, v2.d, v3.d }[0], [x3] /* D4 */ ++ nop ++12: bti j /* D4 */ ++ st4 { v0.d, v1.d, v2.d, v3.d }[0], [x3] + b 99f +-13: st3 { v0.d, v1.d, v2.d }[0], [x3] /* D3 */ ++ nop ++13: bti j /* D3 */ ++ st3 { v0.d, v1.d, v2.d }[0], [x3] + b 99f +-14: stp d0, d1, [x3] /* D2 */ ++ nop ++14: bti j /* D2 */ ++ stp d0, d1, [x3] + b 99f +-15: str d0, [x3] /* D1 */ ++ nop ++15: bti j /* D1 */ ++ str d0, [x3] + b 99f +-16: str q3, [x3, #48] /* Q4 */ + nop +-17: str q2, [x3, #32] /* Q3 */ ++16: bti j /* Q4 */ ++ str q3, [x3, #48] + nop +-18: stp q0, q1, [x3] /* Q2 */ ++ nop ++17: bti j /* Q3 */ ++ str q2, [x3, #32] ++ nop ++ nop ++18: bti j /* Q2 */ ++ stp q0, q1, [x3] + b 99f +-19: str q0, [x3] /* Q1 */ ++ nop ++19: bti j /* Q1 */ ++ str q0, [x3] + b 99f +-20: uxtb w0, w0 /* UINT8 */ ++ nop ++20: bti j /* UINT8 */ ++ uxtb w0, w0 + str x0, [x3] ++ nop + 21: b 99f /* reserved */ + nop +-22: uxth w0, w0 /* UINT16 */ ++ nop ++ nop ++22: bti j /* UINT16 */ ++ uxth w0, w0 + str x0, [x3] ++ nop + 23: b 99f /* reserved */ + nop +-24: mov w0, w0 /* UINT32 */ ++ nop ++ nop ++24: bti j /* UINT32 */ ++ mov w0, w0 + str x0, [x3] ++ nop + 25: b 99f /* reserved */ + nop +-26: sxtb x0, w0 /* SINT8 */ ++ nop ++ nop ++26: bti j /* SINT8 */ ++ sxtb x0, w0 + str x0, [x3] ++ nop + 27: b 99f /* reserved */ + nop +-28: sxth x0, w0 /* SINT16 */ ++ nop ++ nop ++28: bti j /* SINT16 */ ++ sxth x0, w0 + str x0, [x3] ++ nop + 29: b 99f /* reserved */ + nop +-30: sxtw x0, w0 /* SINT32 */ ++ nop ++ nop ++30: bti j /* SINT32 */ ++ sxtw x0, w0 + str x0, [x3] ++ nop + 31: b 99f /* reserved */ + nop ++ nop ++ nop + + /* Return now that result has been populated. */ + 99: +@@ -246,6 +311,7 @@ CNAME(ffi_call_SYSV): + .align 4 + CNAME(ffi_closure_SYSV_V): + cfi_startproc ++ bti c + SIGN_LR + stp x29, x30, [sp, #-ffi_closure_SYSV_FS]! + cfi_adjust_cfa_offset (ffi_closure_SYSV_FS) +@@ -270,6 +336,7 @@ CNAME(ffi_closure_SYSV_V): + .align 4 + cfi_startproc + CNAME(ffi_closure_SYSV): ++ bti c + SIGN_LR + stp x29, x30, [sp, #-ffi_closure_SYSV_FS]! + cfi_adjust_cfa_offset (ffi_closure_SYSV_FS) +@@ -299,74 +366,136 @@ CNAME(ffi_closure_SYSV): + /* Load the return value as directed. */ + adr x1, 0f + and w0, w0, #AARCH64_RET_MASK +- add x1, x1, x0, lsl #3 ++ add x1, x1, x0, lsl #4 + add x3, sp, #16+CALL_CONTEXT_SIZE + br x1 + +- /* Note that each table entry is 2 insns, and thus 8 bytes. */ ++ /* Note that each table entry is 4 insns, and thus 16 bytes. */ + .align 4 +-0: b 99f /* VOID */ ++0: bti j /* VOID */ ++ b 99f + nop +-1: ldr x0, [x3] /* INT64 */ ++ nop ++1: bti j /* INT64 */ ++ ldr x0, [x3] + b 99f +-2: ldp x0, x1, [x3] /* INT128 */ ++ nop ++2: bti j /* INT128 */ ++ ldp x0, x1, [x3] + b 99f ++ nop + 3: brk #1000 /* UNUSED */ + nop ++ nop ++ nop + 4: brk #1000 /* UNUSED */ + nop ++ nop ++ nop + 5: brk #1000 /* UNUSED */ + nop ++ nop ++ nop + 6: brk #1000 /* UNUSED */ + nop ++ nop ++ nop + 7: brk #1000 /* UNUSED */ + nop +-8: ldr s3, [x3, #12] /* S4 */ + nop +-9: ldr s2, [x3, #8] /* S3 */ + nop +-10: ldp s0, s1, [x3] /* S2 */ ++8: bti j /* S4 */ ++ ldr s3, [x3, #12] ++ nop ++ nop ++9: bti j /* S3 */ ++ ldr s2, [x3, #8] ++ nop ++ nop ++10: bti j /* S2 */ ++ ldp s0, s1, [x3] + b 99f +-11: ldr s0, [x3] /* S1 */ ++ nop ++11: bti j /* S1 */ ++ ldr s0, [x3] + b 99f +-12: ldr d3, [x3, #24] /* D4 */ + nop +-13: ldr d2, [x3, #16] /* D3 */ ++12: bti j /* D4 */ ++ ldr d3, [x3, #24] + nop +-14: ldp d0, d1, [x3] /* D2 */ ++ nop ++13: bti j /* D3 */ ++ ldr d2, [x3, #16] ++ nop ++ nop ++14: bti j /* D2 */ ++ ldp d0, d1, [x3] + b 99f +-15: ldr d0, [x3] /* D1 */ ++ nop ++15: bti j /* D1 */ ++ ldr d0, [x3] + b 99f +-16: ldr q3, [x3, #48] /* Q4 */ + nop +-17: ldr q2, [x3, #32] /* Q3 */ ++16: bti j /* Q4 */ ++ ldr q3, [x3, #48] + nop +-18: ldp q0, q1, [x3] /* Q2 */ ++ nop ++17: bti j /* Q3 */ ++ ldr q2, [x3, #32] ++ nop ++ nop ++18: bti j /* Q2 */ ++ ldp q0, q1, [x3] + b 99f +-19: ldr q0, [x3] /* Q1 */ ++ nop ++19: bti j /* Q1 */ ++ ldr q0, [x3] + b 99f +-20: ldrb w0, [x3, #BE(7)] /* UINT8 */ ++ nop ++20: bti j /* UINT8 */ ++ ldrb w0, [x3, #BE(7)] + b 99f ++ nop + 21: brk #1000 /* reserved */ + nop +-22: ldrh w0, [x3, #BE(6)] /* UINT16 */ ++ nop ++ nop ++22: bti j /* UINT16 */ ++ ldrh w0, [x3, #BE(6)] + b 99f ++ nop + 23: brk #1000 /* reserved */ + nop +-24: ldr w0, [x3, #BE(4)] /* UINT32 */ ++ nop ++ nop ++24: bti j /* UINT32 */ ++ ldr w0, [x3, #BE(4)] + b 99f ++ nop + 25: brk #1000 /* reserved */ + nop +-26: ldrsb x0, [x3, #BE(7)] /* SINT8 */ ++ nop ++ nop ++26: bti j /* SINT8 */ ++ ldrsb x0, [x3, #BE(7)] + b 99f ++ nop + 27: brk #1000 /* reserved */ + nop +-28: ldrsh x0, [x3, #BE(6)] /* SINT16 */ ++ nop ++ nop ++28: bti j /* SINT16 */ ++ ldrsh x0, [x3, #BE(6)] + b 99f ++ nop + 29: brk #1000 /* reserved */ + nop +-30: ldrsw x0, [x3, #BE(4)] /* SINT32 */ + nop ++ nop ++30: bti j /* SINT32 */ ++ ldrsw x0, [x3, #BE(4)] ++ nop ++ nop + 31: /* reserved */ + 99: ldp x29, x30, [sp], #ffi_closure_SYSV_FS + cfi_adjust_cfa_offset (-ffi_closure_SYSV_FS) +@@ -479,6 +608,7 @@ CNAME(ffi_closure_trampoline_table_page): + .align 4 + CNAME(ffi_go_closure_SYSV_V): + cfi_startproc ++ bti c + stp x29, x30, [sp, #-ffi_closure_SYSV_FS]! + cfi_adjust_cfa_offset (ffi_closure_SYSV_FS) + cfi_rel_offset (x29, 0) +@@ -502,6 +632,7 @@ CNAME(ffi_go_closure_SYSV_V): + .align 4 + cfi_startproc + CNAME(ffi_go_closure_SYSV): ++ bti c + stp x29, x30, [sp, #-ffi_closure_SYSV_FS]! + cfi_adjust_cfa_offset (ffi_closure_SYSV_FS) + cfi_rel_offset (x29, 0)