4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Hugh Dickins <hu...@google.com>


Merged performance improvements to Kaiser, using distinct kernel
and user Process Context Identifiers to minimize the TLB flushing.

[This work actually all from Dave Hansen 2017-08-30:
still omitting trackswitch mods, and KAISER_REAL_SWITCH deleted.]

Signed-off-by: Hugh Dickins <hu...@google.com>
Signed-off-by: Greg Kroah-Hartman <gre...@linuxfoundation.org>
---
 arch/x86/entry/entry_64.S                   |   10 ++++-
 arch/x86/entry/entry_64_compat.S            |    1 
 arch/x86/include/asm/cpufeatures.h          |    1 
 arch/x86/include/asm/kaiser.h               |   15 ++++++-
 arch/x86/include/asm/pgtable_types.h        |   26 +++++++++++++
 arch/x86/include/asm/tlbflush.h             |   54 +++++++++++++++++++++++-----
 arch/x86/include/uapi/asm/processor-flags.h |    3 +
 arch/x86/kernel/cpu/common.c                |   34 +++++++++++++++++
 arch/x86/kvm/x86.c                          |    3 +
 arch/x86/mm/kaiser.c                        |    7 +++
 arch/x86/mm/tlb.c                           |   46 ++++++++++++++++++++++-
 11 files changed, 182 insertions(+), 18 deletions(-)

--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1317,7 +1317,10 @@ ENTRY(nmi)
        /* %rax is saved above, so OK to clobber here */
        movq    %cr3, %rax
        pushq   %rax
-       andq    $(~KAISER_SHADOW_PGD_OFFSET), %rax
+       /* mask off "user" bit of pgd address and 12 PCID bits: */
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       /* Add back kernel PCID and "no flush" bit */
+       orq     X86_CR3_PCID_KERN_VAR, %rax
        movq    %rax, %cr3
 #endif
        call    do_nmi
@@ -1558,7 +1561,10 @@ end_repeat_nmi:
        /* %rax is saved above, so OK to clobber here */
        movq    %cr3, %rax
        pushq   %rax
-       andq    $(~KAISER_SHADOW_PGD_OFFSET), %rax
+       /* mask off "user" bit of pgd address and 12 PCID bits: */
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       /* Add back kernel PCID and "no flush" bit */
+       orq     X86_CR3_PCID_KERN_VAR, %rax
        movq    %rax, %cr3
 #endif
 
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
 #include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,7 @@
 
 #define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance 
Boost */
 #define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS 
support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && 
CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
--- a/arch/x86/include/asm/kaiser.h
+++ b/arch/x86/include/asm/kaiser.h
@@ -1,5 +1,8 @@
 #ifndef _ASM_X86_KAISER_H
 #define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
 /*
  * This file includes the definitions for the KAISER feature.
  * KAISER is a counter measure against x86_64 side channel attacks on
@@ -21,13 +24,21 @@
 
 .macro _SWITCH_TO_KERNEL_CR3 reg
 movq %cr3, \reg
-andq $(~KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+orq  X86_CR3_PCID_KERN_VAR, \reg
 movq \reg, %cr3
 .endm
 
 .macro _SWITCH_TO_USER_CR3 reg
 movq %cr3, \reg
-orq $(KAISER_SHADOW_PGD_OFFSET), \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/*
+ * This can obviously be one instruction by putting the
+ * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR.
+ * But, just leave it now for simplicity.
+ */
+orq  X86_CR3_PCID_USER_VAR, \reg
+orq  $(KAISER_SHADOW_PGD_OFFSET), \reg
 movq \reg, %cr3
 .endm
 
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -141,6 +141,32 @@
                         _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x4,UL))
+#define X86_CR3_PCID_ASID_USER  (_AC(0x6,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH                (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH                (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH      (X86_CR3_PCID_NOFLUSH | 
X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH      (X86_CR3_PCID_NOFLUSH | 
X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH                (0)
+#define X86_CR3_PCID_USER_FLUSH                (0)
+#define X86_CR3_PCID_KERN_NOFLUSH      (0)
+#define X86_CR3_PCID_USER_NOFLUSH      (0)
+#endif
+
 /*
  * The cache modes defined here are used to translate between pure SW usage
  * and the HW defined cache mode bits and/or PAT entries.
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -13,7 +13,6 @@ static inline void __invpcid(unsigned lo
                             unsigned long type)
 {
        struct { u64 d[2]; } desc = { { pcid, addr } };
-
        /*
         * The memory clobber is because the whole point is to invalidate
         * stale TLB entries and, especially if we're flushing global
@@ -134,14 +133,25 @@ static inline void cr4_set_bits_and_upda
 
 static inline void __native_flush_tlb(void)
 {
+       if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) {
+               /*
+                * If current->mm == NULL then we borrow a mm which may change 
during a
+                * task switch and therefore we must not be preempted while we 
write CR3
+                * back:
+                */
+               preempt_disable();
+               native_write_cr3(native_read_cr3());
+               preempt_enable();
+               return;
+       }
        /*
-        * If current->mm == NULL then we borrow a mm which may change during a
-        * task switch and therefore we must not be preempted while we write CR3
-        * back:
-        */
-       preempt_disable();
-       native_write_cr3(native_read_cr3());
-       preempt_enable();
+        * We are no longer using globals with KAISER, so a
+        * "nonglobals" flush would work too. But, this is more
+        * conservative.
+        *
+        * Note, this works with CR4.PCIDE=0 or 1.
+        */
+       invpcid_flush_all();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
@@ -163,6 +173,8 @@ static inline void __native_flush_tlb_gl
                /*
                 * Using INVPCID is considerably faster than a pair of writes
                 * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                 */
                invpcid_flush_all();
                return;
@@ -182,7 +194,31 @@ static inline void __native_flush_tlb_gl
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-       asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+       /*
+        * SIMICS #GP's if you run INVPCID with type 2/3
+        * and X86_CR4_PCIDE clear.  Shame!
+        *
+        * The ASIDs used below are hard-coded.  But, we must not
+        * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+        * invpcid in the case we are called early.
+        */
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+               asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+               return;
+       }
+       /* Flush the address out of both PCIDs. */
+       /*
+        * An optimization here might be to determine addresses
+        * that are only kernel-mapped and only flush the kernel
+        * ASID.  But, userspace flushes are probably much more
+        * important performance-wise.
+        *
+        * Make sure to do only a single invpcid when KAISER is
+        * disabled and we have only a single ASID.
+        */
+       if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER)
+               invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
+       invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
 }
 
 static inline void __flush_tlb_all(void)
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
 #define X86_CR3_PWT            _BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT                4 /* Page Cache Disable */
 #define X86_CR3_PCD            _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK      _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -324,11 +324,45 @@ static __always_inline void setup_smap(s
        }
 }
 
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these
+ * instead.
+ *
+ * This is also handy because systems that do not support
+ * PCIDs just end up or'ing a 0 into their CR3, which does
+ * no harm.
+ */
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0;
+__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0;
+
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
        if (cpu_has(c, X86_FEATURE_PCID)) {
                if (cpu_has(c, X86_FEATURE_PGE)) {
                        cr4_set_bits(X86_CR4_PCIDE);
+                       /*
+                        * These variables are used by the entry/exit
+                        * code to change PCIDs.
+                        */
+#ifdef CONFIG_KAISER
+                       X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH;
+                       X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH;
+#endif
+                       /*
+                        * INVPCID has two "groups" of types:
+                        * 1/2: Invalidate an individual address
+                        * 3/4: Invalidate all contexts
+                        *
+                        * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+                        * ignore the PCID argument in the descriptor.
+                        * But, we have to be careful not to call 1/2
+                        * with an actual non-zero PCID in them before
+                        * we do the above cr4_set_bits().
+                        */
+                       if (cpu_has(c, X86_FEATURE_INVPCID))
+                               set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
                } else {
                        /*
                         * flush_tlb_all(), as currently implemented, won't
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, u
                        return 1;
 
                /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
-               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || 
!is_long_mode(vcpu))
+               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+                   !is_long_mode(vcpu))
                        return 1;
        }
 
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -239,6 +239,8 @@ static void __init kaiser_init_all_pgds(
 } while (0)
 
 extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+extern unsigned long X86_CR3_PCID_KERN_VAR;
+extern unsigned long X86_CR3_PCID_USER_VAR;
 /*
  * If anything in here fails, we will likely die on one of the
  * first kernel->user transitions and init will die.  But, we
@@ -289,6 +291,11 @@ void __init kaiser_init(void)
        kaiser_add_user_map_early(&debug_idt_table,
                                  sizeof(gate_desc) * NR_VECTORS,
                                  __PAGE_KERNEL);
+
+       kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE,
+                                 __PAGE_KERNEL);
+       kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE,
+                                 __PAGE_KERNEL);
 }
 
 /* Add a mapping to the shadow mapping, and synchronize the mappings */
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -34,6 +34,46 @@ struct flush_tlb_info {
        unsigned long flush_end;
 };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+       unsigned long new_mm_cr3 = __pa(pgdir);
+
+       /*
+        * KAISER, plus PCIDs needs some extra work here.  But,
+        * if either of features is not present, we need no
+        * PCIDs here and just do a normal, full TLB flush with
+        * the write_cr3()
+        */
+       if (!IS_ENABLED(CONFIG_KAISER) ||
+           !cpu_feature_enabled(X86_FEATURE_PCID))
+               goto out_set_cr3;
+       /*
+        * We reuse the same PCID for different tasks, so we must
+        * flush all the entires for the PCID out when we change
+        * tasks.
+        */
+       new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir);
+
+       /*
+        * The flush from load_cr3() may leave old TLB entries
+        * for userspace in place.  We must flush that context
+        * separately.  We can theoretically delay doing this
+        * until we actually load up the userspace CR3, but
+        * that's a bit tricky.  We have to have the "need to
+        * flush userspace PCID" bit per-cpu and check it in the
+        * exit-to-userspace paths.
+        */
+       invpcid_flush_single_context(X86_CR3_PCID_ASID_USER);
+
+out_set_cr3:
+       /*
+        * Caution: many callers of this function expect
+        * that load_cr3() is serializing and orders TLB
+        * fills with respect to the mm_cpumask writes.
+        */
+       write_cr3(new_mm_cr3);
+}
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -45,7 +85,7 @@ void leave_mm(int cpu)
                BUG();
        if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
                cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
-               load_cr3(swapper_pg_dir);
+               load_new_mm_cr3(swapper_pg_dir);
                /*
                 * This gets called in the idle path where RCU
                 * functions differently.  Tracing normally
@@ -120,7 +160,7 @@ void switch_mm_irqs_off(struct mm_struct
                 * ordering guarantee we need.
                 *
                 */
-               load_cr3(next->pgd);
+               load_new_mm_cr3(next->pgd);
 
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
@@ -167,7 +207,7 @@ void switch_mm_irqs_off(struct mm_struct
                         * As above, load_cr3() is serializing and orders TLB
                         * fills with respect to the mm_cpumask write.
                         */
-                       load_cr3(next->pgd);
+                       load_new_mm_cr3(next->pgd);
                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, 
TLB_FLUSH_ALL);
                        load_mm_cr4(next);
                        load_mm_ldt(next);


Reply via email to