By using int3 (a single byte) as a speculation fence instead of lfence
(three bytes), we can shrink the longest alternative to just 15 bytes:

  0:   e8 05 00 00 00          callq  a <.altinstr_replacement+0xa>
  5:   f3 90                   pause  
  7:   cc                      int3   
  8:   eb fb                   jmp    5 <.altinstr_replacement+0x5>
  a:   48 89 04 24             mov    %rax,(%rsp)
  e:   c3                      retq   

This means we can change the alignment from 32 to 16 bytes and get 4
retpolines per 64-byte cacheline instead of 2, which is an I-cache win.

Signed-off-by: Peter Zijlstra (Intel) <pet...@infradead.org>
---
 arch/x86/lib/retpoline.S |    4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

--- a/arch/x86/lib/retpoline.S
+++ b/arch/x86/lib/retpoline.S
@@ -16,7 +16,7 @@
 .Lspec_trap_\@:
        UNWIND_HINT_EMPTY
        pause
-       lfence
+       int3
        jmp .Lspec_trap_\@
 .Ldo_rop_\@:
        mov     %\reg, (%_ASM_SP)
@@ -27,7 +27,7 @@
 .macro THUNK reg
        .section .text.__x86.indirect_thunk
 
-       .align 32
+       .align 16
 SYM_FUNC_START(__x86_indirect_thunk_\reg)
 
        ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *%\reg), \


Reply via email to