On 1/11/2018 3:46 PM, David Woodhouse wrote:
> Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide
> the corresponding thunks. Provide assembler macros for invoking the thunks
> in the same way that GCC does, from native and inline assembler.
> 
> This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In
> some circumstances, IBRS microcode features may be used instead, and the
> retpoline can be disabled.
> 
> On AMD CPUs if lfence is serialising, the retpoline can be dramatically
> simplified to a simple "lfence; jmp *\reg". A future patch, after it has
> been verified that lfence really is serialising in all circumstances, can
> enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition
> to X86_FEATURE_RETPOLINE.
> 
> Do not align the retpoline in the altinstr section, because there is no
> guarantee that it stays aligned when it's copied over the oldinstr during
> alternative patching.
> 
> [ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks]
> [ tglx: Put actual function CALL/JMP in front of the macros, convert to
>       symbolic labels ]
> [ dwmw2: Convert back to numeric labels, merge objtool fixes ]
> 
> Signed-off-by: David Woodhouse <[email protected]>
> Signed-off-by: Thomas Gleixner <[email protected]>
> Acked-by: Arjan van de Ven <[email protected]>
> Acked-by: Ingo Molnar <[email protected]>
> Cc: [email protected]
> Cc: Rik van Riel <[email protected]>
> Cc: Andi Kleen <[email protected]>
> Cc: Peter Zijlstra <[email protected]>
> Cc: Linus Torvalds <[email protected]>
> Cc: Jiri Kosina <[email protected]>
> Cc: Andy Lutomirski <[email protected]>
> Cc: Dave Hansen <[email protected]>
> Cc: Kees Cook <[email protected]>
> Cc: Tim Chen <[email protected]>
> Cc: Greg Kroah-Hartman <[email protected]>
> Cc: Paul Turner <[email protected]>
> Link: https://lkml.kernel.org/r/[email protected]
> ---
>  arch/x86/Kconfig                      |  13 ++++
>  arch/x86/Makefile                     |  10 +++
>  arch/x86/include/asm/asm-prototypes.h |  25 +++++++
>  arch/x86/include/asm/cpufeatures.h    |   2 +
>  arch/x86/include/asm/nospec-branch.h  | 128 ++++++++++++++++++++++++++++++++++
>  arch/x86/kernel/cpu/common.c          |   4 ++
>  arch/x86/lib/Makefile                 |   1 +
>  arch/x86/lib/retpoline.S              |  48 +++++++++++++
>  8 files changed, 231 insertions(+)
>  create mode 100644 arch/x86/include/asm/nospec-branch.h
>  create mode 100644 arch/x86/lib/retpoline.S
> 

...

> diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
> new file mode 100644
> index 0000000..e20e92e
> --- /dev/null
> +++ b/arch/x86/include/asm/nospec-branch.h
> @@ -0,0 +1,128 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef __NOSPEC_BRANCH_H__
> +#define __NOSPEC_BRANCH_H__
> +
> +#include <asm/alternative.h>
> +#include <asm/alternative-asm.h>
> +#include <asm/cpufeatures.h>
> +
> +#ifdef __ASSEMBLY__
> +
> +/*
> + * This should be used immediately before a retpoline alternative.  It tells
> + * objtool where the retpolines are so that it can make sense of the control
> + * flow by just reading the original instruction(s) and ignoring the
> + * alternatives.
> + */
> +.macro ANNOTATE_NOSPEC_ALTERNATIVE
> +     .Lannotate_\@:
> +     .pushsection .discard.nospec
> +     .long .Lannotate_\@ - .
> +     .popsection
> +.endm
> +
> +/*
> + * These are the bare retpoline primitives for indirect jmp and call.
> + * Do not use these directly; they only exist to make the ALTERNATIVE
> + * invocation below less ugly.
> + */
> +.macro RETPOLINE_JMP reg:req
> +     call    .Ldo_rop_\@
> +.Lspec_trap_\@:
> +     pause

I talked with our engineers some more about using pause vs. lfence.
Pause is not serializing on AMD, so the pause/jmp loop will burn power
while it is speculated over, waiting for the return to mispredict to
the correct target.  Can this be changed back to lfence?  It looked
like a very small difference in cycles/time.
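
For illustration, a rough sketch of the change I have in mind, based on
the RETPOLINE_JMP macro in this hunk with only the pause in the
speculation trap swapped for lfence (not a tested patch):

.macro RETPOLINE_JMP reg:req
	call	.Ldo_rop_\@
.Lspec_trap_\@:
	lfence			/* serializing on AMD, stops the speculative loop */
	jmp	.Lspec_trap_\@
.Ldo_rop_\@:
	mov	\reg, (%_ASM_SP)
	ret
.endm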

Thanks,
Tom

> +     jmp     .Lspec_trap_\@
> +.Ldo_rop_\@:
> +     mov     \reg, (%_ASM_SP)
> +     ret
> +.endm
> +
> +/*
> + * This is a wrapper around RETPOLINE_JMP so the called function in reg
> + * returns to the instruction after the macro.
> + */
> +.macro RETPOLINE_CALL reg:req
> +     jmp     .Ldo_call_\@
> +.Ldo_retpoline_jmp_\@:
> +     RETPOLINE_JMP \reg
> +.Ldo_call_\@:
> +     call    .Ldo_retpoline_jmp_\@
> +.endm
> +
> +/*
> + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple
> + * indirect jmp/call which may be susceptible to the Spectre variant 2
> + * attack.
> + */
> +.macro JMP_NOSPEC reg:req
> +#ifdef CONFIG_RETPOLINE
> +     ANNOTATE_NOSPEC_ALTERNATIVE
> +     ALTERNATIVE_2 __stringify(jmp *\reg),                           \
> +             __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \
> +             __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD
> +#else
> +     jmp     *\reg
> +#endif
> +.endm
> +
> +.macro CALL_NOSPEC reg:req
> +#ifdef CONFIG_RETPOLINE
> +     ANNOTATE_NOSPEC_ALTERNATIVE
> +     ALTERNATIVE_2 __stringify(call *\reg),                          \
> +             __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\
> +             __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD
> +#else
> +     call    *\reg
> +#endif
> +.endm
> +
> +#else /* __ASSEMBLY__ */
> +
> +#define ANNOTATE_NOSPEC_ALTERNATIVE                          \
> +     "999:\n\t"                                              \
> +     ".pushsection .discard.nospec\n\t"                      \
> +     ".long 999b - .\n\t"                                    \
> +     ".popsection\n\t"
> +
> +#if defined(CONFIG_X86_64) && defined(RETPOLINE)
> +
> +/*
> + * Since the inline asm uses the %V modifier which is only in newer GCC,
> + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE.
> + */
> +# define CALL_NOSPEC                                         \
> +     ANNOTATE_NOSPEC_ALTERNATIVE                             \
> +     ALTERNATIVE(                                            \
> +     "call *%[thunk_target]\n",                              \
> +     "call __x86_indirect_thunk_%V[thunk_target]\n",         \
> +     X86_FEATURE_RETPOLINE)
> +# define THUNK_TARGET(addr) [thunk_target] "r" (addr)
> +
> +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE)
> +/*
> + * For i386 we use the original ret-equivalent retpoline, because
> + * otherwise we'll run out of registers. We don't care about CET
> + * here, anyway.
> + */
> +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n",  \
> +     "       jmp    904f;\n"                                 \
> +     "       .align 16\n"                                    \
> +     "901:   call   903f;\n"                                 \
> +     "902:   pause;\n"                                       \
> +     "       jmp    902b;\n"                                 \
> +     "       .align 16\n"                                    \
> +     "903:   addl   $4, %%esp;\n"                            \
> +     "       pushl  %[thunk_target];\n"                      \
> +     "       ret;\n"                                         \
> +     "       .align 16\n"                                    \
> +     "904:   call   901b;\n",                                \
> +     X86_FEATURE_RETPOLINE)
> +
> +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
> +#else /* No retpoline */
> +# define CALL_NOSPEC "call *%[thunk_target]\n"
> +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr)
> +#endif
> +
> +#endif /* __ASSEMBLY__ */
> +#endif /* __NOSPEC_BRANCH_H__ */
> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
> index 372ba3f..7a671d1 100644
> --- a/arch/x86/kernel/cpu/common.c
> +++ b/arch/x86/kernel/cpu/common.c
> @@ -905,6 +905,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
>       setup_force_cpu_bug(X86_BUG_SPECTRE_V1);
>       setup_force_cpu_bug(X86_BUG_SPECTRE_V2);
>  
> +#ifdef CONFIG_RETPOLINE
> +     setup_force_cpu_cap(X86_FEATURE_RETPOLINE);
> +#endif
> +
>       fpu__init_system(c);
>  
>  #ifdef CONFIG_X86_32
> diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
> index 457f681..d435c89 100644
> --- a/arch/x86/lib/Makefile
> +++ b/arch/x86/lib/Makefile
> @@ -26,6 +26,7 @@ lib-y += memcpy_$(BITS).o
>  lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
>  lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
>  lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
> +lib-$(CONFIG_RETPOLINE) += retpoline.o
>  
>  obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
>  
> diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S
> new file mode 100644
> index 0000000..cb45c6c
> --- /dev/null
> +++ b/arch/x86/lib/retpoline.S
> @@ -0,0 +1,48 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#include <linux/stringify.h>
> +#include <linux/linkage.h>
> +#include <asm/dwarf2.h>
> +#include <asm/cpufeatures.h>
> +#include <asm/alternative-asm.h>
> +#include <asm/export.h>
> +#include <asm/nospec-branch.h>
> +
> +.macro THUNK reg
> +     .section .text.__x86.indirect_thunk.\reg
> +
> +ENTRY(__x86_indirect_thunk_\reg)
> +     CFI_STARTPROC
> +     JMP_NOSPEC %\reg
> +     CFI_ENDPROC
> +ENDPROC(__x86_indirect_thunk_\reg)
> +.endm
> +
> +/*
> + * Despite being an assembler file we can't just use .irp here
> + * because __KSYM_DEPS__ only uses the C preprocessor and would
> + * only see one instance of "__x86_indirect_thunk_\reg" rather
> + * than one per register with the correct names. So we do it
> + * the simple and nasty way...
> + */
> +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg)
> +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg)
> +
> +GENERATE_THUNK(_ASM_AX)
> +GENERATE_THUNK(_ASM_BX)
> +GENERATE_THUNK(_ASM_CX)
> +GENERATE_THUNK(_ASM_DX)
> +GENERATE_THUNK(_ASM_SI)
> +GENERATE_THUNK(_ASM_DI)
> +GENERATE_THUNK(_ASM_BP)
> +GENERATE_THUNK(_ASM_SP)
> +#ifdef CONFIG_64BIT
> +GENERATE_THUNK(r8)
> +GENERATE_THUNK(r9)
> +GENERATE_THUNK(r10)
> +GENERATE_THUNK(r11)
> +GENERATE_THUNK(r12)
> +GENERATE_THUNK(r13)
> +GENERATE_THUNK(r14)
> +GENERATE_THUNK(r15)
> +#endif
> 
