paranoid_entry/exit are assembly macros. Provide C versions of these macros (kernel_paranoid_entry() and kernel_paranoid_exit()). The C functions are functionally equivalent to the assembly macros, except that kernel_paranoid_entry() doesn't save registers in pt_regs like paranoid_entry does.
Signed-off-by: Alexandre Chartre <alexandre.char...@oracle.com> --- arch/x86/entry/common.c | 157 ++++++++++++++++++++++++++++ arch/x86/include/asm/entry-common.h | 10 ++ 2 files changed, 167 insertions(+) diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d09b1ded5287..54d0931801e1 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -387,3 +387,160 @@ static __always_inline unsigned long save_and_switch_to_kernel_cr3(void) static __always_inline void restore_cr3(unsigned long cr3) {} #endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * "Paranoid" entry path from exception stack. Ensure that the CR3 and + * GS registers are correctly set for the kernel. Return GSBASE related + * information in kernel_entry_state depending on the availability of + * the FSGSBASE instructions: + * + * FSGSBASE kernel_entry_state + * N swapgs=true -> SWAPGS on exit + * swapgs=false -> no SWAPGS on exit + * + * Y gsbase=GSBASE value at entry, must be restored in + * kernel_paranoid_exit() + * + * Note that per-cpu variables are accessed using the GS register, + * so paranoid entry code cannot access per-cpu variables before + * kernel_paranoid_entry() has been called. + */ +noinstr void kernel_paranoid_entry(struct kernel_entry_state *state) +{ + unsigned long gsbase; + unsigned int cpu; + + /* + * Save CR3 in the kernel entry state. This value will be + * restored, verbatim, at exit. Needed if the paranoid entry + * interrupted another entry that already switched to the user + * CR3 value but has not yet returned to userspace. + * + * This is also why CS (stashed in the "iret frame" by the + * hardware at entry) can not be used: this may be a return + * to kernel code, but with a user CR3 value. + * + * Switching CR3 does not depend on kernel GSBASE so it can + * be done before switching to the kernel GSBASE. This is + * required for FSGSBASE because the kernel GSBASE has to + * be retrieved from a kernel internal table. 
+ */ + state->cr3 = save_and_switch_to_kernel_cr3(); + + /* + * Handling GSBASE depends on the availability of FSGSBASE. + * + * Without FSGSBASE the kernel enforces that negative GSBASE + * values indicate kernel GSBASE. With FSGSBASE no assumptions + * can be made about the GSBASE value when entering from user + * space. + */ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + /* + * Read the current GSBASE and store it in the kernel + * entry state unconditionally, retrieve and set the + * current CPU's kernel GSBASE. The stored value has to + * be restored at exit unconditionally. + * + * The unconditional write to GS base below ensures that + * no subsequent loads based on a mispredicted GS base + * can happen, therefore no LFENCE is needed here. + */ + state->gsbase = rdgsbase(); + + /* + * Fetch the per-CPU GSBASE value for this processor. We + * normally use %gs for accessing per-CPU data, but we + * are setting up %gs here and obviously can not use %gs + * itself to access per-CPU data. + */ + if (IS_ENABLED(CONFIG_SMP)) { + /* + * Load CPU from the GDT. Do not use RDPID, + * because KVM loads guest's TSC_AUX on vm-entry + * and may not restore the host's value until + * the CPU returns to userspace. Thus the kernel + * would consume a guest's TSC_AUX if an NMI + * arrives while running KVM's run loop. + */ + asm_inline volatile ("lsl %[seg],%[p]" + : [p] "=r" (cpu) + : [seg] "r" (__CPUNODE_SEG)); + + cpu &= VDSO_CPUNODE_MASK; + gsbase = __per_cpu_offset[cpu]; + } else { + gsbase = *pcpu_unit_offsets; + } + + wrgsbase(gsbase); + + } else { + /* + * The kernel-enforced convention is that a negative GSBASE + * indicates a kernel value. No SWAPGS needed on entry + * and exit. + */ + rdmsrl(MSR_GS_BASE, gsbase); + if (((long)gsbase) >= 0) { + swapgs(); + /* + * Do an lfence to prevent GS speculation. 
+ */ + alternative("", "lfence", + X86_FEATURE_FENCE_SWAPGS_KERNEL); + state->swapgs = true; + } else { + state->swapgs = false; + } + } +} + +/* + * "Paranoid" exit path from exception stack. Restore the CR3 and + * GS registers as they were on entry. This is invoked only + * on return from IST interrupts that came from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, there's no good reason to try + * to handle preemption here. + * + * The kernel_entry_state contains the GSBASE related information + * depending on the availability of the FSGSBASE instructions: + * + * FSGSBASE kernel_entry_state + * N swapgs=true -> SWAPGS on exit + * swapgs=false -> no SWAPGS on exit + * + * Y gsbase=GSBASE value at entry, must be restored + * unconditionally + * + * Note that per-cpu variables are accessed using the GS register, + * so paranoid entry code cannot access per-cpu variables after + * kernel_paranoid_exit() has been called. + */ +noinstr void kernel_paranoid_exit(struct kernel_entry_state *state) +{ + /* + * The order of operations is important. RESTORE_CR3 requires + * kernel GSBASE. + * + * NB to anyone to try to optimize this code: this code does + * not execute at all for exceptions from user mode. Those + * exceptions go through error_exit instead. 
+ */ + restore_cr3(state->cr3); + + /* With FSGSBASE enabled, unconditionally restore GSBASE */ + if (static_cpu_has(X86_FEATURE_FSGSBASE)) { + wrgsbase(state->gsbase); + return; + } + + /* On non-FSGSBASE systems, conditionally do SWAPGS */ + if (state->swapgs) { + /* We are returning to a context with user GSBASE */ + swapgs_unsafe_stack(); + } +} diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h index b05b212f5ebc..b75e9230c990 100644 --- a/arch/x86/include/asm/entry-common.h +++ b/arch/x86/include/asm/entry-common.h @@ -163,6 +163,16 @@ static inline void switch_to_kernel_cr3(void) {} static inline void switch_to_user_cr3(void) {} #endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +struct kernel_entry_state { + unsigned long cr3; + unsigned long gsbase; + bool swapgs; +}; + +void kernel_paranoid_entry(struct kernel_entry_state *state); +void kernel_paranoid_exit(struct kernel_entry_state *state); + #endif /* MODULE */ #endif -- 2.18.4