paranoid_entry/exit are assembly macros. Provide C versions of
these macros (kernel_paranoid_entry() and kernel_paranoid_exit()).
The C functions are functionally equivalent to the assembly macros,
except that kernel_paranoid_entry() doesn't save registers in
pt_regs like paranoid_entry does.

Signed-off-by: Alexandre Chartre <alexandre.chartre@oracle.com>
---
 arch/x86/entry/common.c             | 157 ++++++++++++++++++++++++++++
 arch/x86/include/asm/entry-common.h |  10 ++
 2 files changed, 167 insertions(+)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index d09b1ded5287..54d0931801e1 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -387,3 +387,160 @@ static __always_inline unsigned long 
save_and_switch_to_kernel_cr3(void)
 static __always_inline void restore_cr3(unsigned long cr3) {}
 
 #endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * "Paranoid" entry path from exception stack. Ensure that the CR3 and
+ * GS registers are correctly set for the kernel. Return GSBASE related
+ * information in kernel_entry_state depending on the availability of
+ * the FSGSBASE instructions:
+ *
+ * FSGSBASE    kernel_entry_state
+ *     N        swapgs=true -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored in
+ *              kernel_paranoid_exit()
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid entry code cannot access per-cpu variables before
+ * kernel_paranoid_entry() has been called.
+ */
+noinstr void kernel_paranoid_entry(struct kernel_entry_state *state)
+{
+       unsigned long gsbase;
+       unsigned int cpu;
+
+       /*
+        * Save CR3 in the kernel entry state.  This value will be
+        * restored, verbatim, at exit.  Needed if the paranoid entry
+        * interrupted another entry that already switched to the user
+        * CR3 value but has not yet returned to userspace.
+        *
+        * This is also why CS (stashed in the "iret frame" by the
+        * hardware at entry) can not be used: this may be a return
+        * to kernel code, but with a user CR3 value.
+        *
+        * Switching CR3 does not depend on kernel GSBASE so it can
+        * be done before switching to the kernel GSBASE. This is
+        * required for FSGSBASE because the kernel GSBASE has to
+        * be retrieved from a kernel internal table.
+        */
+       state->cr3 = save_and_switch_to_kernel_cr3();
+
+       /*
+        * Handling GSBASE depends on the availability of FSGSBASE.
+        *
+        * Without FSGSBASE the kernel enforces that negative GSBASE
+        * values indicate kernel GSBASE. With FSGSBASE no assumptions
+        * can be made about the GSBASE value when entering from user
+        * space.
+        */
+       if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+               /*
+                * Read the current GSBASE and store it in the kernel
+                * entry state unconditionally, retrieve and set the
+                * current CPU's kernel GSBASE. The stored value has to
+                * be restored at exit unconditionally.
+                *
+                * The unconditional write to GS base below ensures that
+                * no subsequent loads based on a mispredicted GS base
+                * can happen, therefore no LFENCE is needed here.
+                */
+               state->gsbase = rdgsbase();
+
+               /*
+                * Fetch the per-CPU GSBASE value for this processor. We
+                * normally use %gs for accessing per-CPU data, but we
+                * are setting up %gs here and obviously can not use %gs
+                * itself to access per-CPU data.
+                */
+               if (IS_ENABLED(CONFIG_SMP)) {
+                       /*
+                        * Load CPU from the GDT. Do not use RDPID,
+                        * because KVM loads guest's TSC_AUX on vm-entry
+                        * and may not restore the host's value until
+                        * the CPU returns to userspace. Thus the kernel
+                        * would consume a guest's TSC_AUX if an NMI
+                        * arrives while running KVM's run loop.
+                        */
+                       asm_inline volatile ("lsl %[seg],%[p]"
+                                            : [p] "=r" (cpu)
+                                            : [seg] "r" (__CPUNODE_SEG));
+
+                       cpu &= VDSO_CPUNODE_MASK;
+                       gsbase = __per_cpu_offset[cpu];
+               } else {
+                       gsbase = *pcpu_unit_offsets;
+               }
+
+               wrgsbase(gsbase);
+
+       } else {
+               /*
+                * The kernel-enforced convention is a negative GSBASE
+                * indicates a kernel value. No SWAPGS needed on entry
+                * and exit.
+                */
+               rdmsrl(MSR_GS_BASE, gsbase);
+               if (((long)gsbase) >= 0) {
+                       swapgs();
+                       /*
+                        * Do an lfence to prevent GS speculation.
+                        */
+                       alternative("", "lfence",
+                                   X86_FEATURE_FENCE_SWAPGS_KERNEL);
+                       state->swapgs = true;
+               } else {
+                       state->swapgs = false;
+               }
+       }
+}
+
+/*
+ * "Paranoid" exit path from exception stack. Restore the CR3 and
+ * GS registers as they were on entry. This is invoked only
+ * on return from IST interrupts that came from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated.  Fortunately, there's no good reason to try
+ * to handle preemption here.
+ *
+ * The kernel_entry_state contains the GSBASE related information
+ * depending on the availability of the FSGSBASE instructions:
+ *
+ * FSGSBASE    kernel_entry_state
+ *     N        swapgs=true  -> SWAPGS on exit
+ *              swapgs=false -> no SWAPGS on exit
+ *
+ *     Y        gsbase=GSBASE value at entry, must be restored
+ *              unconditionally
+ *
+ * Note that per-cpu variables are accessed using the GS register,
+ * so paranoid exit code cannot access per-cpu variables after
+ * kernel_paranoid_exit() has been called.
+ */
+noinstr void kernel_paranoid_exit(struct kernel_entry_state *state)
+{
+       /*
+        * The order of operations is important. RESTORE_CR3 requires
+        * kernel GSBASE.
+        *
+        * NB to anyone to try to optimize this code: this code does
+        * not execute at all for exceptions from user mode. Those
+        * exceptions go through error_exit instead.
+        */
+       restore_cr3(state->cr3);
+
+       /* With FSGSBASE enabled, unconditionally restore GSBASE */
+       if (static_cpu_has(X86_FEATURE_FSGSBASE)) {
+               wrgsbase(state->gsbase);
+               return;
+       }
+
+       /* On non-FSGSBASE systems, conditionally do SWAPGS */
+       if (state->swapgs) {
+               /* We are returning to a context with user GSBASE */
+               swapgs_unsafe_stack();
+       }
+}
diff --git a/arch/x86/include/asm/entry-common.h 
b/arch/x86/include/asm/entry-common.h
index b05b212f5ebc..b75e9230c990 100644
--- a/arch/x86/include/asm/entry-common.h
+++ b/arch/x86/include/asm/entry-common.h
@@ -163,6 +163,16 @@ static inline void switch_to_kernel_cr3(void) {}
 static inline void switch_to_user_cr3(void) {}
 
 #endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+struct kernel_entry_state {
+       unsigned long cr3;
+       unsigned long gsbase;
+       bool swapgs;
+};
+
+void kernel_paranoid_entry(struct kernel_entry_state *state);
+void kernel_paranoid_exit(struct kernel_entry_state *state);
+
 #endif /* MODULE */
 
 #endif
-- 
2.18.4

Reply via email to