diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 152ec4e87b57..5d2676d043de 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2763,6 +2763,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 
        nojitter        [IA-64] Disables jitter checking for ITC timers.
 
+       nopti           [X86-64] Disable KAISER isolation of kernel from user.
+
        no-kvmclock     [X86,KVM] Disable paravirtualized KVM clock driver
 
        no-kvmapf       [X86,KVM] Disable paravirtualized asynchronous page
@@ -3325,6 +3327,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
        pt.             [PARIDE]
                        See Documentation/blockdev/paride.txt.
 
+       pti=            [X86_64]
+                       Control KAISER user/kernel address space isolation:
+                       on - enable
+                       off - disable
+                       auto - default setting
+
        pty.legacy_count=
                        [KNL] Number of legacy pty's. Overwrites compiled-in
                        default number.
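
With this patch applied, isolation is compiled in via CONFIG_PAGE_TABLE_ISOLATION and enabled by default at boot; the two options documented above are how it is controlled from the boot loader. A minimal usage example, assuming a GRUB-style "linux" line (the image path and root device are illustrative only):

    linux /boot/vmlinuz-4.9.75 root=/dev/sda1 ro pti=off

"nopti" has the same effect as "pti=off", while omitting both options (or passing "pti=auto") leaves the default behaviour in place.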
diff --git a/Makefile b/Makefile
index 075e429732e7..acbc1b032db2 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,6 @@
 VERSION = 4
 PATCHLEVEL = 9
-SUBLEVEL = 74
+SUBLEVEL = 75
 EXTRAVERSION =
 NAME = Roaring Lionus
 
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 766a5211f827..2728e1b7e4a6 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -9,6 +9,7 @@
  */
 #undef CONFIG_PARAVIRT
 #undef CONFIG_PARAVIRT_SPINLOCKS
+#undef CONFIG_PAGE_TABLE_ISOLATION
 #undef CONFIG_KASAN
 
 #include <linux/linkage.h>
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index e7b0e7ff4c58..af4e58132d91 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -36,6 +36,7 @@
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <asm/export.h>
+#include <asm/kaiser.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -146,6 +147,7 @@ ENTRY(entry_SYSCALL_64)
         * it is too small to ever cause noticeable irq latency.
         */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * A hypervisor implementation might want to use a label
         * after the swapgs, so that it can do the swapgs
@@ -228,6 +230,14 @@ entry_SYSCALL_64_fastpath:
        movq    RIP(%rsp), %rcx
        movq    EFLAGS(%rsp), %r11
        RESTORE_C_REGS_EXCEPT_RCX_R11
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        movq    RSP(%rsp), %rsp
        USERGS_SYSRET64
 
@@ -323,10 +333,26 @@ return_from_SYSCALL_64:
 syscall_return_via_sysret:
        /* rcx and r11 are already restored (see code above) */
        RESTORE_C_REGS_EXCEPT_RCX_R11
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        movq    RSP(%rsp), %rsp
        USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+       /*
+        * This opens a window where we have a user CR3, but are
+        * running in the kernel.  This makes using the CS
+        * register useless for telling whether or not we need to
+        * switch CR3 in NMIs.  Normal interrupts are OK because
+        * they are off here.
+        */
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -424,6 +450,7 @@ ENTRY(ret_from_fork)
        movq    %rsp, %rdi
        call    syscall_return_slowpath /* returns with IRQs disabled */
        TRACE_IRQS_ON                   /* user mode is traced as IRQS on */
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_regs_and_iret
 
@@ -478,6 +505,7 @@ END(irq_entries_start)
         * tracking that we're in kernel mode.
         */
        SWAPGS
+       SWITCH_KERNEL_CR3
 
        /*
         * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -535,6 +563,7 @@ GLOBAL(retint_user)
        mov     %rsp,%rdi
        call    prepare_exit_to_usermode
        TRACE_IRQS_IRETQ
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_regs_and_iret
 
@@ -612,6 +641,7 @@ native_irq_return_ldt:
 
        pushq   %rdi                            /* Stash user RDI */
        SWAPGS
+       SWITCH_KERNEL_CR3
        movq    PER_CPU_VAR(espfix_waddr), %rdi
        movq    %rax, (0*8)(%rdi)               /* user RAX */
        movq    (1*8)(%rsp), %rax               /* user RIP */
@@ -638,6 +668,7 @@ native_irq_return_ldt:
         * still points to an RO alias of the ESPFIX stack.
         */
        orq     PER_CPU_VAR(espfix_stack), %rax
+       SWITCH_USER_CR3
        SWAPGS
        movq    %rax, %rsp
 
@@ -1022,7 +1053,11 @@ idtentry machine_check                                   has_error_code=0        paranoid=1 do_sym=*machine_check_vec
 /*
  * Save all registers in pt_regs, and switch gs if needed.
  * Use slow, but surefire "are we in kernel?" check.
- * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ *
+ * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit
+ *         ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit
  */
 ENTRY(paranoid_entry)
        cld
@@ -1035,7 +1070,26 @@ ENTRY(paranoid_entry)
        js      1f                              /* negative -> in kernel */
        SWAPGS
        xorl    %ebx, %ebx
-1:     ret
+1:
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /*
+        * We might have come in between a swapgs and a SWITCH_KERNEL_CR3
+        * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit.
+        * Do a conditional SWITCH_KERNEL_CR3: this could safely be done
+        * unconditionally, but we need to find out whether the reverse
+        * should be done on return (conveyed to paranoid_exit in %ebx).
+        */
+       ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+       testl   $KAISER_SHADOW_PGD_OFFSET, %eax
+       jz      2f
+       orl     $2, %ebx
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+       ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+       movq    %rax, %cr3
+2:
+#endif
+       ret
 END(paranoid_entry)
 
 /*
@@ -1048,19 +1102,26 @@ END(paranoid_entry)
  * be complicated.  Fortunately, we there's no good reason
  * to try to handle preemption here.
  *
- * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it)
+ * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3
+ *           ebx=1: needs neither swapgs nor SWITCH_USER_CR3
+ *           ebx=2: needs both swapgs and SWITCH_USER_CR3
+ *           ebx=3: needs SWITCH_USER_CR3 but not swapgs
  */
 ENTRY(paranoid_exit)
        DISABLE_INTERRUPTS(CLBR_NONE)
        TRACE_IRQS_OFF_DEBUG
-       testl   %ebx, %ebx                      /* swapgs needed? */
+       TRACE_IRQS_IRETQ_DEBUG
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */
+       testl   $2, %ebx                        /* SWITCH_USER_CR3 needed? */
+       jz      paranoid_exit_no_switch
+       SWITCH_USER_CR3
+paranoid_exit_no_switch:
+#endif
+       testl   $1, %ebx                        /* swapgs needed? */
        jnz     paranoid_exit_no_swapgs
-       TRACE_IRQS_IRETQ
        SWAPGS_UNSAFE_STACK
-       jmp     paranoid_exit_restore
 paranoid_exit_no_swapgs:
-       TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
        RESTORE_EXTRA_REGS
        RESTORE_C_REGS
        REMOVE_PT_GPREGS_FROM_STACK 8
@@ -1075,6 +1136,13 @@ ENTRY(error_entry)
        cld
        SAVE_C_REGS 8
        SAVE_EXTRA_REGS 8
+       /*
+        * error_entry() always returns with a kernel gsbase and
+        * CR3.  We must also have a kernel CR3/gsbase before
+        * calling TRACE_IRQS_*.  Just unconditionally switch to
+        * the kernel CR3 here.
+        */
+       SWITCH_KERNEL_CR3
        xorl    %ebx, %ebx
        testb   $3, CS+8(%rsp)
        jz      .Lerror_kernelspace
@@ -1235,6 +1303,10 @@ ENTRY(nmi)
         */
 
        SWAPGS_UNSAFE_STACK
+       /*
+        * percpu variables are mapped with user CR3, so no need
+        * to switch CR3 here.
+        */
        cld
        movq    %rsp, %rdx
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1268,12 +1340,34 @@ ENTRY(nmi)
 
        movq    %rsp, %rdi
        movq    $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /* Unconditionally use kernel CR3 for do_nmi() */
+       /* %rax is saved above, so OK to clobber here */
+       ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+       /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+       ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+       pushq   %rax
+       /* mask off "user" bit of pgd address and 12 PCID bits: */
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       movq    %rax, %cr3
+2:
+#endif
        call    do_nmi
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /*
+        * Unconditionally restore CR3.  I know we return to
+        * kernel code that needs user CR3, but do we ever return
+        * to "user mode" where we need the kernel CR3?
+        */
+       ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
        /*
         * Return back to user mode.  We must *not* do the normal exit
-        * work, because we don't want to enable interrupts.  Fortunately,
-        * do_nmi doesn't modify pt_regs.
+        * work, because we don't want to enable interrupts.  Do not
+        * switch to user CR3: we might be going back to kernel code
+        * that had a user CR3 set.
         */
        SWAPGS
        jmp     restore_c_regs_and_iret
@@ -1470,22 +1564,55 @@ end_repeat_nmi:
        ALLOC_PT_GPREGS_ON_STACK
 
        /*
-        * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
-        * as we should not be calling schedule in NMI context.
-        * Even with normal interrupts enabled. An NMI should not be
-        * setting NEED_RESCHED or anything that normal interrupts and
-        * exceptions might do.
+        * Use the same approach as paranoid_entry to handle SWAPGS, but
+        * without CR3 handling since we do that differently in NMIs.  No
+        * need to use paranoid_exit as we should not be calling schedule
+        * in NMI context.  Even with normal interrupts enabled. An NMI
+        * should not be setting NEED_RESCHED or anything that normal
+        * interrupts and exceptions might do.
         */
-       call    paranoid_entry
-
-       /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
+       cld
+       SAVE_C_REGS
+       SAVE_EXTRA_REGS
+       movl    $1, %ebx
+       movl    $MSR_GS_BASE, %ecx
+       rdmsr
+       testl   %edx, %edx
+       js      1f                              /* negative -> in kernel */
+       SWAPGS
+       xorl    %ebx, %ebx
+1:
        movq    %rsp, %rdi
        movq    $-1, %rsi
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /* Unconditionally use kernel CR3 for do_nmi() */
+       /* %rax is saved above, so OK to clobber here */
+       ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER
+       /* If PCID enabled, NOFLUSH now and NOFLUSH on return */
+       ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID
+       pushq   %rax
+       /* mask off "user" bit of pgd address and 12 PCID bits: */
+       andq    $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax
+       movq    %rax, %cr3
+2:
+#endif
+
+       /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
        call    do_nmi
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       /*
+        * Unconditionally restore CR3.  We might be returning to
+        * kernel code that needs user CR3, like just before
+        * a sysret.
+        */
+       ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER
+#endif
+
        testl   %ebx, %ebx                      /* swapgs needed? */
        jnz     nmi_restore
 nmi_swapgs:
+       /* We fixed up CR3 above, so no need to switch it here */
        SWAPGS_UNSAFE_STACK
 nmi_restore:
        RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e1721dafbcb1..d76a97653980 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,8 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -48,6 +50,7 @@
 ENTRY(entry_SYSENTER_compat)
        /* Interrupts are off on entry. */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
        movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
        /*
@@ -184,6 +187,7 @@ ENDPROC(entry_SYSENTER_compat)
 ENTRY(entry_SYSCALL_compat)
        /* Interrupts are off on entry. */
        SWAPGS_UNSAFE_STACK
+       SWITCH_KERNEL_CR3_NO_STACK
 
        /* Stash user ESP and switch to the kernel stack. */
        movl    %esp, %r8d
@@ -259,6 +263,7 @@ sysret32_from_system_call:
        xorq    %r8, %r8
        xorq    %r9, %r9
        xorq    %r10, %r10
+       SWITCH_USER_CR3
        movq    RSP-ORIG_RAX(%rsp), %rsp
        swapgs
        sysretl
@@ -297,7 +302,7 @@ ENTRY(entry_INT80_compat)
        PARAVIRT_ADJUST_EXCEPTION_FRAME
        ASM_CLAC                        /* Do this early to minimize exposure */
        SWAPGS
-
+       SWITCH_KERNEL_CR3_NO_STACK
        /*
         * User tracing code (ptrace or signal handlers) might assume that
         * the saved RAX contains a 32-bit number when we're invoking a 32-bit
@@ -338,6 +343,7 @@ ENTRY(entry_INT80_compat)
 
        /* Go back to user mode. */
        TRACE_IRQS_ON
+       SWITCH_USER_CR3
        SWAPGS
        jmp     restore_regs_and_iret
 END(entry_INT80_compat)
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 9dfeeeca0ea8..8e7a3f1df3a5 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -2,11 +2,15 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 
+#include <asm/kaiser.h>
 #include <asm/perf_event.h>
 #include <asm/insn.h>
 
 #include "../perf_event.h"
 
+static
+DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store);
+
 /* The size of a BTS record in bytes: */
 #define BTS_RECORD_SIZE                24
 
@@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu)
 
 static DEFINE_PER_CPU(void *, insn_buffer);
 
+static void *dsalloc(size_t size, gfp_t flags, int node)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       unsigned int order = get_order(size);
+       struct page *page;
+       unsigned long addr;
+
+       page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
+       if (!page)
+               return NULL;
+       addr = (unsigned long)page_address(page);
+       if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
+               __free_pages(page, order);
+               addr = 0;
+       }
+       return (void *)addr;
+#else
+       return kmalloc_node(size, flags | __GFP_ZERO, node);
+#endif
+}
+
+static void dsfree(const void *buffer, size_t size)
+{
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       if (!buffer)
+               return;
+       kaiser_remove_mapping((unsigned long)buffer, size);
+       free_pages((unsigned long)buffer, get_order(size));
+#else
+       kfree(buffer);
+#endif
+}
+
 static int alloc_pebs_buffer(int cpu)
 {
        struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
@@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu)
        if (!x86_pmu.pebs)
                return 0;
 
-       buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
+       buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node);
        if (unlikely(!buffer))
                return -ENOMEM;
 
@@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu)
        if (x86_pmu.intel_cap.pebs_format < 2) {
                ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node);
                if (!ibuffer) {
-                       kfree(buffer);
+                       dsfree(buffer, x86_pmu.pebs_buffer_size);
                        return -ENOMEM;
                }
                per_cpu(insn_buffer, cpu) = ibuffer;
@@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu)
        kfree(per_cpu(insn_buffer, cpu));
        per_cpu(insn_buffer, cpu) = NULL;
 
-       kfree((void *)(unsigned long)ds->pebs_buffer_base);
+       dsfree((void *)(unsigned long)ds->pebs_buffer_base,
+                       x86_pmu.pebs_buffer_size);
        ds->pebs_buffer_base = 0;
 }
 
@@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu)
        if (!x86_pmu.bts)
                return 0;
 
-       buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
+       buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node);
        if (unlikely(!buffer)) {
                WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__);
                return -ENOMEM;
@@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu)
        if (!ds || !x86_pmu.bts)
                return;
 
-       kfree((void *)(unsigned long)ds->bts_buffer_base);
+       dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE);
        ds->bts_buffer_base = 0;
 }
 
 static int alloc_ds_buffer(int cpu)
 {
-       int node = cpu_to_node(cpu);
-       struct debug_store *ds;
-
-       ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node);
-       if (unlikely(!ds))
-               return -ENOMEM;
+       struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu);
 
+       memset(ds, 0, sizeof(*ds));
        per_cpu(cpu_hw_events, cpu).ds = ds;
 
        return 0;
@@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu)
                return;
 
        per_cpu(cpu_hw_events, cpu).ds = NULL;
-       kfree(ds);
 }
 
 void release_ds_buffers(void)
diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h
index e01f7f7ccb0c..84ae170bc3d0 100644
--- a/arch/x86/include/asm/cmdline.h
+++ b/arch/x86/include/asm/cmdline.h
@@ -2,5 +2,7 @@
 #define _ASM_X86_CMDLINE_H
 
 int cmdline_find_option_bool(const char *cmdline_ptr, const char *option);
+int cmdline_find_option(const char *cmdline_ptr, const char *option,
+                       char *buffer, int bufsize);
 
 #endif /* _ASM_X86_CMDLINE_H */
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index ed10b5bf9b93..454a37adb823 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -189,6 +189,7 @@
 
 #define X86_FEATURE_CPB                ( 7*32+ 2) /* AMD Core Performance Boost */
 #define X86_FEATURE_EPB                ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */
 
 #define X86_FEATURE_HW_PSTATE  ( 7*32+ 8) /* AMD HW-PState */
 #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
@@ -197,6 +198,9 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER     ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 12080d87da3b..2ed5a2b3f8f7 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -43,7 +43,7 @@ struct gdt_page {
        struct desc_struct gdt[GDT_ENTRIES];
 } __attribute__((aligned(PAGE_SIZE)));
 
-DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
+DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page);
 
 static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
 {
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index b90e1053049b..0817d63bce41 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -178,7 +178,7 @@ extern char irq_entries_start[];
 #define VECTOR_RETRIGGERED     ((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..802bbbdfe143
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,141 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+#include <uapi/asm/processor-flags.h> /* For PCID constants */
+
+/*
+ * This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on
+ * the kernel virtual memory.  It has a shadow pgd for every process: the
+ * shadow pgd has a minimalistic kernel-set mapped, but includes the whole
+ * user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode,
+ * the shadow pgd is enabled. By this, the virtual memory caches are freed,
+ * and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user
+ * mode, such as the entry/exit functions of the user space, or the stacks.
+ */
+
+#define KAISER_SHADOW_PGD_OFFSET 0x1000
+
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg
+/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */
+ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg regb
+/*
+ * regb must be the low byte portion of reg: because we have arranged
+ * for the low byte of the user PCID to serve as the high byte of NOFLUSH
+ * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are
+ * not enabled): so that the one register can update both memory and cr3.
+ */
+movq %cr3, \reg
+orq  PER_CPU_VAR(x86_cr3_pcid_user), \reg
+js   9f
+/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */
+movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7)
+9:
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+8:
+.endm
+
+.macro SWITCH_USER_CR3
+ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER
+_SWITCH_TO_USER_CR3 %rax %al
+popq %rax
+8:
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+ALTERNATIVE "jmp 8f", \
+       __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \
+       X86_FEATURE_KAISER
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+8:
+.endm
+
+#else /* CONFIG_PAGE_TABLE_ISOLATION */
+
+.macro SWITCH_KERNEL_CR3
+.endm
+.macro SWITCH_USER_CR3
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+#else /* __ASSEMBLY__ */
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Upon kernel/user mode switch, it may happen that the address
+ * space has to be switched before the registers have been
+ * stored.  To change the address space, another register is
+ * needed.  A register therefore has to be stored/restored.
+*/
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+
+extern int kaiser_enabled;
+extern void __init kaiser_check_boottime_disable(void);
+#else
+#define kaiser_enabled 0
+static inline void __init kaiser_check_boottime_disable(void) {}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
+/*
+ * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set,
+ * so as to build with tests on kaiser_enabled instead of #ifdefs.
+ */
+
+/**
+ *  kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: The mapping flags of the pages
+ *
+ *  The mapping is done on a global scope, so no bigger
+ *  synchronization has to be done.  the pages have to be
+ *  manually unmapped again when they are not needed any longer.
+ */
+extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+/**
+ *  kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  kaiser_init - Initialize the shadow mapping
+ *
+ *  Most parts of the shadow mapping can be mapped upon boot
+ *  time.  Only per-process things like the thread stacks
+ *  or a new LDT have to be mapped at runtime.  These boot-
+ *  time mappings are permanent and never unmapped.
+ */
+extern void kaiser_init(void);
+
+#endif /* __ASSEMBLY */
+
+#endif /* _ASM_X86_KAISER_H */
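
The bit arithmetic performed by _SWITCH_TO_KERNEL_CR3 and _SWITCH_TO_USER_CR3 above can be restated in C. The sketch below is illustrative only: the helper names are invented for this note, the real switches must stay in assembly (using either the stack or unsafe_stack_register_backup for the scratch register), and it assumes x86_cr3_pcid_user is initialized elsewhere in this patch to KAISER_SHADOW_PGD_OFFSET plus the user ASID, with X86_CR3_PCID_NOFLUSH set whenever no user TLB flush is pending.

/* Sketch of _SWITCH_TO_KERNEL_CR3: drop the ASID bits and the shadow-pgd bit. */
static inline unsigned long kaiser_kernel_cr3(unsigned long cr3)
{
        cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
        /* Mirrors the ALTERNATIVE "bts $63": keep kernel-ASID TLB entries when PCID is on. */
        if (boot_cpu_has(X86_FEATURE_PCID))
                cr3 |= X86_CR3_PCID_NOFLUSH;
        return cr3;
}

/* Sketch of _SWITCH_TO_USER_CR3: point CR3 at the shadow pgd and the user ASID. */
static inline unsigned long kaiser_user_cr3(unsigned long kernel_cr3)
{
        /* e.g. kernel pgd at 0x23c000 with PCID -> user CR3 0x800000000023d080 */
        return kernel_cr3 | this_cpu_read(x86_cr3_pcid_user);
}

The movb into x86_cr3_pcid_user+7 in the assembly has no C equivalent here: it is the trick that re-arms the NOFLUSH byte once a flushing CR3 write has been performed.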
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 437feb436efa..2536f90cd30c 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -18,6 +18,12 @@
 #ifndef __ASSEMBLY__
 #include <asm/x86_init.h>
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+#else
+#define kaiser_enabled 0
+#endif
+
 void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd);
 void ptdump_walk_pgd_level_checkwx(void);
 
@@ -690,7 +696,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address)
 
 static inline int pgd_bad(pgd_t pgd)
 {
-       return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE;
+       pgdval_t ignore_flags = _PAGE_USER;
+       /*
+        * We set NX on KAISER pgds that map userspace memory so
+        * that userspace can not meaningfully use the kernel
+        * page table by accident; it will fault on the first
+        * instruction it tries to run.  See native_set_pgd().
+        */
+       if (kaiser_enabled)
+               ignore_flags |= _PAGE_NX;
+
+       return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
 }
 
 static inline int pgd_none(pgd_t pgd)
@@ -903,7 +919,15 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  */
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
-       memcpy(dst, src, count * sizeof(pgd_t));
+       memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+       if (kaiser_enabled) {
+               /* Clone the shadow pgd part as well */
+               memcpy(native_get_shadow_pgd(dst),
+                       native_get_shadow_pgd(src),
+                       count * sizeof(pgd_t));
+       }
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 1cc82ece9ac1..ce97c8c6a310 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,32 @@ static inline void native_pud_clear(pud_t *pud)
        native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd);
+
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+#ifdef CONFIG_DEBUG_VM
+       /* linux/mmdebug.h may not have been included at this point */
+       BUG_ON(!kaiser_enabled);
+#endif
+       return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE);
+}
+#else
+static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       return pgd;
+}
+static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
+{
+       BUILD_BUG_ON(1);
+       return NULL;
+}
+#endif /* CONFIG_PAGE_TABLE_ISOLATION */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
-       *pgdp = pgd;
+       *pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
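
kaiser_set_shadow_pgd() itself is added by a later hunk of this patch (arch/x86/mm/kaiser.c). Going by the pgd_bad() comment added to pgtable.h above, its job can be outlined as follows; treat this purely as a sketch, and note that is_userspace_pgd() (a "does this entry index the user half of the pgd?" test) is a name assumed here for illustration.

/* Outline only: mirror user-half entries into the shadow pgd, and make the
 * kernel copy non-executable so userspace faults if it ever runs on it. */
static pgd_t kaiser_set_shadow_pgd_sketch(pgd_t *pgdp, pgd_t pgd)
{
        if (kaiser_enabled && (pgd.pgd & _PAGE_USER) && is_userspace_pgd(pgdp)) {
                /* The shadow (user) page tables get the full entry... */
                native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                /* ...while the kernel copy gets _PAGE_NX (see pgd_bad() above). */
                if (__supported_pte_mask & _PAGE_NX)
                        pgd.pgd |= _PAGE_NX;
        }
        return pgd;
}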
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 8b4de22d6429..f1c8ac468292 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -119,7 +119,7 @@
 #define _PAGE_DEVMAP   (_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 #define _PAGE_TABLE    (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |        \
                         _PAGE_ACCESSED | _PAGE_DIRTY)
@@ -137,6 +137,33 @@
                         _PAGE_SOFT_DIRTY)
 #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
 
+/* The ASID is the lower 12 bits of CR3 */
+#define X86_CR3_PCID_ASID_MASK  (_AC((1<<12)-1,UL))
+
+/* Mask for all the PCID-related bits in CR3: */
+#define X86_CR3_PCID_MASK       (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK)
+#define X86_CR3_PCID_ASID_KERN  (_AC(0x0,UL))
+
+#if defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64)
+/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */
+#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL))
+
+#define X86_CR3_PCID_KERN_FLUSH                (X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_FLUSH                (X86_CR3_PCID_ASID_USER)
+#define X86_CR3_PCID_KERN_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN)
+#define X86_CR3_PCID_USER_NOFLUSH      (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER)
+#else
+#define X86_CR3_PCID_ASID_USER  (_AC(0x0,UL))
+/*
+ * PCIDs are unsupported on 32-bit and none of these bits can be
+ * set in CR3:
+ */
+#define X86_CR3_PCID_KERN_FLUSH                (0)
+#define X86_CR3_PCID_USER_FLUSH                (0)
+#define X86_CR3_PCID_KERN_NOFLUSH      (0)
+#define X86_CR3_PCID_USER_NOFLUSH      (0)
+#endif
+
 /*
  * The cache modes defined here are used to translate between pure SW usage
  * and the HW defined cache mode bits and/or PAT entries.
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 83db0eae9979..8cb52ee3ade6 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -308,7 +308,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7d2ea6b1f7d9..94146f665a3c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -132,6 +132,24 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
        cr4_set_bits(mask);
 }
 
+/*
+ * Declare a couple of kaiser interfaces here for convenience,
+ * to avoid the need for asm/kaiser.h in unexpected places.
+ */
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+extern int kaiser_enabled;
+extern void kaiser_setup_pcid(void);
+extern void kaiser_flush_tlb_on_return_to_user(void);
+#else
+#define kaiser_enabled 0
+static inline void kaiser_setup_pcid(void)
+{
+}
+static inline void kaiser_flush_tlb_on_return_to_user(void)
+{
+}
+#endif
+
 static inline void __native_flush_tlb(void)
 {
        /*
@@ -140,6 +158,8 @@ static inline void __native_flush_tlb(void)
         * back:
         */
        preempt_disable();
+       if (kaiser_enabled)
+               kaiser_flush_tlb_on_return_to_user();
        native_write_cr3(native_read_cr3());
        preempt_enable();
 }
@@ -149,20 +169,27 @@ static inline void __native_flush_tlb_global_irq_disabled(void)
        unsigned long cr4;
 
        cr4 = this_cpu_read(cpu_tlbstate.cr4);
-       /* clear PGE */
-       native_write_cr4(cr4 & ~X86_CR4_PGE);
-       /* write old PGE again and flush TLBs */
-       native_write_cr4(cr4);
+       if (cr4 & X86_CR4_PGE) {
+               /* clear PGE and flush TLB of all entries */
+               native_write_cr4(cr4 & ~X86_CR4_PGE);
+               /* restore PGE as it was before */
+               native_write_cr4(cr4);
+       } else {
+               /* do it with cr3, letting kaiser flush user PCID */
+               __native_flush_tlb();
+       }
 }
 
 static inline void __native_flush_tlb_global(void)
 {
        unsigned long flags;
 
-       if (static_cpu_has(X86_FEATURE_INVPCID)) {
+       if (this_cpu_has(X86_FEATURE_INVPCID)) {
                /*
                 * Using INVPCID is considerably faster than a pair of writes
                 * to CR4 sandwiched inside an IRQ flag save/restore.
+                *
+                * Note, this works with CR4.PCIDE=0 or 1.
                 */
                invpcid_flush_all();
                return;
@@ -174,24 +201,45 @@ static inline void __native_flush_tlb_global(void)
         * be called from deep inside debugging code.)
         */
        raw_local_irq_save(flags);
-
        __native_flush_tlb_global_irq_disabled();
-
        raw_local_irq_restore(flags);
 }
 
 static inline void __native_flush_tlb_single(unsigned long addr)
 {
-       asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+       /*
+        * SIMICS #GP's if you run INVPCID with type 2/3
+        * and X86_CR4_PCIDE clear.  Shame!
+        *
+        * The ASIDs used below are hard-coded.  But, we must not
+        * call invpcid(type=1/2) before CR4.PCIDE=1.  Just call
+        * invlpg in the case we are called early.
+        */
+
+       if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) {
+               if (kaiser_enabled)
+                       kaiser_flush_tlb_on_return_to_user();
+               asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
+               return;
+       }
+       /* Flush the address out of both PCIDs. */
+       /*
+        * An optimization here might be to determine addresses
+        * that are only kernel-mapped and only flush the kernel
+        * ASID.  But, userspace flushes are probably much more
+        * important performance-wise.
+        *
+        * Make sure to do only a single invpcid when KAISER is
+        * disabled and we have only a single ASID.
+        */
+       if (kaiser_enabled)
+               invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr);
+       invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr);
 }
 
 static inline void __flush_tlb_all(void)
 {
-       if (boot_cpu_has(X86_FEATURE_PGE))
-               __flush_tlb_global();
-       else
-               __flush_tlb();
-
+       __flush_tlb_global();
        /*
         * Note: if we somehow had PCID but not PGE, then this wouldn't work --
         * we'd end up flushing kernel translations for the current ASID but
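
kaiser_flush_tlb_on_return_to_user() is only declared in the tlbflush.h hunk above; its body is added in arch/x86/mm/kaiser.c further down. Conceptually it defers the flush of the user PCID: it rewrites the per-cpu x86_cr3_pcid_user value without X86_CR3_PCID_NOFLUSH, so the next SWITCH_USER_CR3 performs a flushing CR3 write (and then re-arms NOFLUSH). A minimal sketch, using the constants from the pgtable_types.h hunk above; the actual body may differ in detail:

/* Sketch: arm a one-shot flush of the user ASID for the next exit to user space. */
void kaiser_flush_tlb_on_return_to_user(void)
{
        if (this_cpu_has(X86_FEATURE_PCID))
                this_cpu_write(x86_cr3_pcid_user,
                               X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}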
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 567de50a4c2a..6768d1321016 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -77,7 +77,8 @@
 #define X86_CR3_PWT            _BITUL(X86_CR3_PWT_BIT)
 #define X86_CR3_PCD_BIT                4 /* Page Cache Disable */
 #define X86_CR3_PCD            _BITUL(X86_CR3_PCD_BIT)
-#define X86_CR3_PCID_MASK      _AC(0x00000fff,UL) /* PCID Mask */
+#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */
+#define X86_CR3_PCID_NOFLUSH    _BITULL(X86_CR3_PCID_NOFLUSH_BIT)
 
 /*
  * Intel CPU features in CR4
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 91588be529b9..918e44772b04 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -93,7 +93,7 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
        /*
         * We need valid kernel segments for data and code in long mode too
@@ -327,8 +327,21 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 static void setup_pcid(struct cpuinfo_x86 *c)
 {
        if (cpu_has(c, X86_FEATURE_PCID)) {
-               if (cpu_has(c, X86_FEATURE_PGE)) {
+               if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) {
                        cr4_set_bits(X86_CR4_PCIDE);
+                       /*
+                        * INVPCID has two "groups" of types:
+                        * 1/2: Invalidate an individual address
+                        * 3/4: Invalidate all contexts
+                        *
+                        * 1/2 take a PCID, but 3/4 do not.  So, 3/4
+                        * ignore the PCID argument in the descriptor.
+                        * But, we have to be careful not to call 1/2
+                        * with an actual non-zero PCID in them before
+                        * we do the above cr4_set_bits().
+                        */
+                       if (cpu_has(c, X86_FEATURE_INVPCID))
+                               set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE);
                } else {
                        /*
                         * flush_tlb_all(), as currently implemented, won't
@@ -341,6 +354,7 @@ static void setup_pcid(struct cpuinfo_x86 *c)
                        clear_cpu_cap(c, X86_FEATURE_PCID);
                }
        }
+       kaiser_setup_pcid();
 }
 
 /*
@@ -1365,7 +1379,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
          [DEBUG_STACK - 1]                     = DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
        [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
@@ -1523,6 +1537,14 @@ void cpu_init(void)
         * try to read it.
         */
        cr4_init_shadow();
+       if (!kaiser_enabled) {
+               /*
+                * secondary_startup_64() deferred setting PGE in cr4:
+                * probe_page_size_mask() sets it on the boot cpu,
+                * but it needs to be set on each secondary cpu.
+                */
+               cr4_set_bits(X86_CR4_PGE);
+       }
 
        /*
         * Load microcode on this cpu if a valid microcode is available.
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 04f89caef9c4..e33b38541be3 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,15 @@ void __init init_espfix_bsp(void)
        /* Install the espfix pud into the kernel page directory */
        pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
        pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+       /*
+        * Just copy the top-level PGD that is mapping the espfix
+        * area to ensure it is mapped into the shadow user page
+        * tables.
+        */
+       if (kaiser_enabled) {
+               set_pgd(native_get_shadow_pgd(pgd_p),
+                       __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page)));
+       }
 
        /* Randomize the locations */
        init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index b4421cc191b0..67cd7c1b99da 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -190,8 +190,8 @@ ENTRY(secondary_startup_64)
        movq    $(init_level4_pgt - __START_KERNEL_map), %rax
 1:
 
-       /* Enable PAE mode and PGE */
-       movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+       /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */
+       movl    $(X86_CR4_PAE | X86_CR4_PSE), %ecx
        movq    %rcx, %cr4
 
        /* Setup early boot stage 4 level pagetables. */
@@ -405,6 +405,27 @@ GLOBAL(early_recursion_flag)
        .balign PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+/*
+ * Each PGD needs to be 8k long and 8k aligned.  We do not
+ * ever go out to userspace with these, so we do not
+ * strictly *need* the second page, but this allows us to
+ * have a single set_pgd() implementation that does not
+ * need to worry about whether it has 4k or 8k to work
+ * with.
+ *
+ * This ensures PGDs are 8k long:
+ */
+#define KAISER_USER_PGD_FILL   512
+/* This ensures they are 8k-aligned: */
+#define NEXT_PGD_PAGE(name) \
+       .balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#define KAISER_USER_PGD_FILL   0
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
        i = 0 ;                                         \
@@ -414,9 +435,10 @@ GLOBAL(name)
        .endr
 
        __INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
        .fill   511,8,0
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(early_dynamic_pgts)
        .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
@@ -424,16 +446,18 @@ NEXT_PAGE(early_dynamic_pgts)
        .data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .fill   512,8,0
+       .fill   KAISER_USER_PGD_FILL,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
        .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .org    init_level4_pgt + L4_START_KERNEL*8, 0
        /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_ident_pgt)
        .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
@@ -444,6 +468,7 @@ NEXT_PAGE(level2_ident_pgt)
         */
        PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
 #endif
+       .fill   KAISER_USER_PGD_FILL,8,0
 
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..f480b38a03c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
        .flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
        [0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index 5f70014ca602..8bc68cfc0d33 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/kaiser.h>
 
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm)
        set_ldt(pc->ldt->entries, pc->ldt->size);
 }
 
+static void __free_ldt_struct(struct ldt_struct *ldt)
+{
+       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
+               vfree(ldt->entries);
+       else
+               free_page((unsigned long)ldt->entries);
+       kfree(ldt);
+}
+
 /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */
 static struct ldt_struct *alloc_ldt_struct(int size)
 {
        struct ldt_struct *new_ldt;
        int alloc_size;
+       int ret;
 
        if (size > LDT_ENTRIES)
                return NULL;
@@ -66,7 +77,13 @@ static struct ldt_struct *alloc_ldt_struct(int size)
                return NULL;
        }
 
+       ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size,
+                                __PAGE_KERNEL);
        new_ldt->size = size;
+       if (ret) {
+               __free_ldt_struct(new_ldt);
+               return NULL;
+       }
        return new_ldt;
 }
 
@@ -92,12 +109,10 @@ static void free_ldt_struct(struct ldt_struct *ldt)
        if (likely(!ldt))
                return;
 
+       kaiser_remove_mapping((unsigned long)ldt->entries,
+                             ldt->size * LDT_ENTRY_SIZE);
        paravirt_free_ldt(ldt->entries, ldt->size);
-       if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE)
-               vfree(ldt->entries);
-       else
-               free_page((unsigned long)ldt->entries);
-       kfree(ldt);
+       __free_ldt_struct(ldt);
 }
 
 /*
diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c
index bb3840cedb4f..ee43b36075c7 100644
--- a/arch/x86/kernel/paravirt_patch_64.c
+++ b/arch/x86/kernel/paravirt_patch_64.c
@@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3");
-DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
 DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd");
 
@@ -59,7 +58,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
                PATCH_SITE(pv_mmu_ops, read_cr3);
                PATCH_SITE(pv_mmu_ops, write_cr3);
                PATCH_SITE(pv_cpu_ops, clts);
-               PATCH_SITE(pv_mmu_ops, flush_tlb_single);
                PATCH_SITE(pv_cpu_ops, wbinvd);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS)
                case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 8e10e72bf6ee..a55b32007785 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -41,7 +41,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
        .x86_tss = {
                .sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index feaab07fa124..6b55012d02a3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -114,6 +114,7 @@
 #include <asm/microcode.h>
 #include <asm/mmu_context.h>
 #include <asm/kaslr.h>
+#include <asm/kaiser.h>
 
 /*
  * max_low_pfn_mapped: highest direct mapped pfn under 4GB
@@ -1019,6 +1020,12 @@ void __init setup_arch(char **cmdline_p)
         */
        init_hypervisor_platform();
 
+       /*
+        * This needs to happen right after XENPV is set on xen and
+        * kaiser_enabled is checked below in cleanup_highmap().
+        */
+       kaiser_check_boottime_disable();
+
        x86_init.resources.probe_roms();
 
        /* after parse_early_param, so could debug it */
diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c
index 1c113db9ed57..2bb5ee464df3 100644
--- a/arch/x86/kernel/tracepoint.c
+++ b/arch/x86/kernel/tracepoint.c
@@ -9,10 +9,12 @@
 #include <linux/atomic.h>
 
 atomic_t trace_idt_ctr = ATOMIC_INIT(0);
+__aligned(PAGE_SIZE)
 struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1,
                                (unsigned long) trace_idt_table };
 
 /* No need to be aligned, but done to keep all IDTs defined the same way. */
+__aligned(PAGE_SIZE)
 gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss;
 
 static int trace_irq_vector_refcount;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 7e28e6c877d9..73304b1a03cc 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -773,7 +773,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
                        return 1;
 
                /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
-               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
+               if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) ||
+                   !is_long_mode(vcpu))
                        return 1;
        }
 
diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c
index 5cc78bf57232..3261abb21ef4 100644
--- a/arch/x86/lib/cmdline.c
+++ b/arch/x86/lib/cmdline.c
@@ -104,7 +104,112 @@ __cmdline_find_option_bool(const char *cmdline, int max_cmdline_size,
        return 0;       /* Buffer overrun */
 }
 
+/*
+ * Find a non-boolean option (i.e. option=argument). In accordance with
+ * standard Linux practice, if this option is repeated, this returns the
+ * last instance on the command line.
+ *
+ * @cmdline: the cmdline string
+ * @max_cmdline_size: the maximum size of cmdline
+ * @option: option string to look for
+ * @buffer: memory buffer to return the option argument
+ * @bufsize: size of the supplied memory buffer
+ *
+ * Returns the length of the argument (regardless of if it was
+ * truncated to fit in the buffer), or -1 on not found.
+ */
+static int
+__cmdline_find_option(const char *cmdline, int max_cmdline_size,
+                     const char *option, char *buffer, int bufsize)
+{
+       char c;
+       int pos = 0, len = -1;
+       const char *opptr = NULL;
+       char *bufptr = buffer;
+       enum {
+               st_wordstart = 0,       /* Start of word/after whitespace */
+               st_wordcmp,     /* Comparing this word */
+               st_wordskip,    /* Miscompare, skip */
+               st_bufcpy,      /* Copying this to buffer */
+       } state = st_wordstart;
+
+       if (!cmdline)
+               return -1;      /* No command line */
+
+       /*
+        * This 'pos' check ensures we do not overrun
+        * a non-NULL-terminated 'cmdline'
+        */
+       while (pos++ < max_cmdline_size) {
+               c = *(char *)cmdline++;
+               if (!c)
+                       break;
+
+               switch (state) {
+               case st_wordstart:
+                       if (myisspace(c))
+                               break;
+
+                       state = st_wordcmp;
+                       opptr = option;
+                       /* fall through */
+
+               case st_wordcmp:
+                       if ((c == '=') && !*opptr) {
+                               /*
+                                * We matched all the way to the end of the
+                                * option we were looking for, prepare to
+                                * copy the argument.
+                                */
+                               len = 0;
+                               bufptr = buffer;
+                               state = st_bufcpy;
+                               break;
+                       } else if (c == *opptr++) {
+                               /*
+                                * We are currently matching, so continue
+                                * to the next character on the cmdline.
+                                */
+                               break;
+                       }
+                       state = st_wordskip;
+                       /* fall through */
+
+               case st_wordskip:
+                       if (myisspace(c))
+                               state = st_wordstart;
+                       break;
+
+               case st_bufcpy:
+                       if (myisspace(c)) {
+                               state = st_wordstart;
+                       } else {
+                               /*
+                                * Increment len, but don't overrun the
+                                * supplied buffer and leave room for the
+                                * NULL terminator.
+                                */
+                               if (++len < bufsize)
+                                       *bufptr++ = c;
+                       }
+                       break;
+               }
+       }
+
+       if (bufsize)
+               *bufptr = '\0';
+
+       return len;
+}
+
 int cmdline_find_option_bool(const char *cmdline, const char *option)
 {
        return __cmdline_find_option_bool(cmdline, COMMAND_LINE_SIZE, option);
 }
+
+int cmdline_find_option(const char *cmdline, const char *option, char *buffer,
+                       int bufsize)
+{
+       return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option,
+                                    buffer, bufsize);
+}
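
The new cmdline_find_option() exists to support the "pti=" parsing done by kaiser_check_boottime_disable() in arch/x86/mm/kaiser.c, further down in this patch. As a usage illustration only, with a hypothetical caller name, parsing the option from the early boot command line looks roughly like this:

static void __init pti_cmdline_example(void)
{
        char arg[5];
        int ret;

        /* "nopti" is a plain boolean option */
        if (cmdline_find_option_bool(boot_command_line, "nopti")) {
                pr_info("disabled on command line.\n");
                return;
        }

        ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
        if (ret == 2 && !strncmp(arg, "on", 2))
                pr_info("force-enabled on command line.\n");
        else if (ret == 3 && !strncmp(arg, "off", 3))
                pr_info("disabled on command line.\n");
        /* any other value, or no "pti=" option at all, means "auto" */
}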
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 96d2b847e09e..c548b46100cb 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -37,5 +37,5 @@ obj-$(CONFIG_NUMA_EMU)                += numa_emulation.o
 
 obj-$(CONFIG_X86_INTEL_MPX)    += mpx.o
 obj-$(CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS) += pkeys.o
-obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
-
+obj-$(CONFIG_RANDOMIZE_MEMORY) += kaslr.o
+obj-$(CONFIG_PAGE_TABLE_ISOLATION)             += kaiser.o
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 0381638168d1..1e779bca4f3e 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -177,7 +177,7 @@ static void __init probe_page_size_mask(void)
                cr4_set_bits_and_update_boot(X86_CR4_PSE);
 
        /* Enable PGE if available */
-       if (boot_cpu_has(X86_FEATURE_PGE)) {
+       if (boot_cpu_has(X86_FEATURE_PGE) && !kaiser_enabled) {
                cr4_set_bits_and_update_boot(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        } else
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 3e27ded6ac65..7df8e3a79dc0 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -324,6 +324,16 @@ void __init cleanup_highmap(void)
                        continue;
                if (vaddr < (unsigned long) _text || vaddr > end)
                        set_pmd(pmd, __pmd(0));
+               else if (kaiser_enabled) {
+                       /*
+                        * level2_kernel_pgt is initialized with _PAGE_GLOBAL:
+                        * clear that now.  This is not important, so long as
+                        * CR4.PGE remains clear, but it removes an anomaly.
+                        * Physical mapping setup below avoids _PAGE_GLOBAL
+                        * by use of massage_pgprot() inside pfn_pte() etc.
+                        */
+                       set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL));
+               }
        }
 }
 
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..d8376b4ad9f0
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,455 @@
+#include <linux/bug.h>
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/uaccess.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt)     "Kernel/User page tables isolation: " fmt
+
+#include <asm/kaiser.h>
+#include <asm/tlbflush.h>      /* to verify its kaiser declarations */
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#include <asm/cmdline.h>
+
+int kaiser_enabled __read_mostly = 1;
+EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */
+
+__visible
+DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/*
+ * These can have bit 63 set, so we can not just use a plain "or"
+ * instruction to get their value or'd into CR3.  It would take
+ * another register.  So, we use a memory reference to these instead.
+ *
+ * This is also handy because systems that do not support PCIDs
+ * just end up or'ing a 0 into their CR3, which does no harm.
+ */
+DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
+
+/*
+ * At runtime, the only things we map are some things for CPU
+ * hotplug, and stacks for new processes.  No two CPUs will ever
+ * be populating the same addresses, so we only need to ensure
+ * that we protect between two CPUs trying to allocate and
+ * populate the same page table page.
+ *
+ * Only take this lock when doing a set_p[4um]d(), but it is not
+ * needed for doing a set_pte().  We assume that only the *owner*
+ * of a given allocation will be doing this for _their_
+ * allocation.
+ *
+ * This ensures that once a system has been running for a while
+ * and there have been stacks all over and these page tables
+ * are fully populated, there will be no further acquisitions of
+ * this lock.
+ */
+static DEFINE_SPINLOCK(shadow_table_allocation_lock);
+
+/*
+ * Returns -1 on error.
+ */
+static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
+{
+       pgd_t *pgd;
+       pud_t *pud;
+       pmd_t *pmd;
+       pte_t *pte;
+
+       pgd = pgd_offset_k(vaddr);
+       /*
+        * We made all the kernel PGDs present in kaiser_init().
+        * We expect them to stay that way.
+        */
+       BUG_ON(pgd_none(*pgd));
+       /*
+        * PGDs are either 512GB or 128TB on all x86_64
+        * configurations.  We don't handle these.
+        */
+       BUG_ON(pgd_large(*pgd));
+
+       pud = pud_offset(pgd, vaddr);
+       if (pud_none(*pud)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pud_large(*pud))
+               return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);
+
+       pmd = pmd_offset(pud, vaddr);
+       if (pmd_none(*pmd)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       if (pmd_large(*pmd))
+               return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
+
+       pte = pte_offset_kernel(pmd, vaddr);
+       if (pte_none(*pte)) {
+               WARN_ON_ONCE(1);
+               return -1;
+       }
+
+       return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+}
+
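The helper above returns the physical address backing a kernel virtual address at whichever level the mapping stops: the frame number shifted up by PAGE_SHIFT, plus the offset bits that the relevant page mask leaves behind. A small sketch of that composition for the 4k and 2M cases; the constants are the usual x86-64 values, used here only for illustration.

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT      12
#define PAGE_MASK       (~((1UL << PAGE_SHIFT) - 1))
#define PMD_PAGE_MASK   (~((1UL << 21) - 1))          /* 2M large page */

/* pfn << PAGE_SHIFT gives the base of the frame; the unmasked virtual
 * address bits supply the offset within the (small or large) page. */
static uint64_t pa_4k(uint64_t pfn, uint64_t vaddr)
{
        return (pfn << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

static uint64_t pa_2m(uint64_t pfn, uint64_t vaddr)
{
        return (pfn << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);
}

int main(void)
{
        uint64_t vaddr = 0xffffffff81234567UL;        /* made-up kernel address */

        printf("4k: %#llx\n", (unsigned long long)pa_4k(0x1000, vaddr));
        printf("2M: %#llx\n", (unsigned long long)pa_2m(0x1000, vaddr));
        return 0;
}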
+/*
+ * This is a relatively normal page table walk, except that it
+ * also tries to allocate page tables pages along the way.
+ *
+ * Returns a pointer to a PTE on success, or NULL on failure.
+ */
+static pte_t *kaiser_pagetable_walk(unsigned long address)
+{
+       pmd_t *pmd;
+       pud_t *pud;
+       pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
+       gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
+
+       if (pgd_none(*pgd)) {
+               WARN_ONCE(1, "All shadow pgds should have been populated");
+               return NULL;
+       }
+       BUILD_BUG_ON(pgd_large(*pgd) != 0);
+
+       pud = pud_offset(pgd, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pud_large(*pud)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pud_none(*pud)) {
+               unsigned long new_pmd_page = __get_free_page(gfp);
+               if (!new_pmd_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pud_none(*pud)) {
+                       set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pmd_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pmd_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       pmd = pmd_offset(pud, address);
+       /* The shadow page tables do not use large mappings: */
+       if (pmd_large(*pmd)) {
+               WARN_ON(1);
+               return NULL;
+       }
+       if (pmd_none(*pmd)) {
+               unsigned long new_pte_page = __get_free_page(gfp);
+               if (!new_pte_page)
+                       return NULL;
+               spin_lock(&shadow_table_allocation_lock);
+               if (pmd_none(*pmd)) {
+                       set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page)));
+                       __inc_zone_page_state(virt_to_page((void *)
+                                               new_pte_page), NR_KAISERTABLE);
+               } else
+                       free_page(new_pte_page);
+               spin_unlock(&shadow_table_allocation_lock);
+       }
+
+       return pte_offset_kernel(pmd, address);
+}
+
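kaiser_pagetable_walk() above follows a check / allocate / lock / re-check pattern: the candidate table page is allocated outside shadow_table_allocation_lock, and if another CPU has populated the slot in the meantime the page is simply freed again. A minimal pthread sketch of the same pattern, with made-up names standing in for the pud/pmd helpers:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;
static void *level_table;               /* stands in for a missing pud/pmd page */

/* Populate the next-level table once, without holding the lock across the allocation. */
static void *get_or_alloc_table(void)
{
        void *new_page;

        if (level_table)                /* fast path: already populated */
                return level_table;

        new_page = calloc(1, 4096);     /* allocate outside the lock */
        if (!new_page)
                return NULL;

        pthread_mutex_lock(&table_lock);
        if (!level_table)               /* re-check: did another thread beat us? */
                level_table = new_page;
        else
                free(new_page);         /* lost the race: drop our copy */
        pthread_mutex_unlock(&table_lock);

        return level_table;
}

int main(void)
{
        return get_or_alloc_table() ? 0 : 1;
}

As in the kernel version, the unlocked fast path relies on the slot being installed with a single aligned store.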
+static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
+                              unsigned long flags)
+{
+       int ret = 0;
+       pte_t *pte;
+       unsigned long start_addr = (unsigned long )__start_addr;
+       unsigned long address = start_addr & PAGE_MASK;
+       unsigned long end_addr = PAGE_ALIGN(start_addr + size);
+       unsigned long target_address;
+
+       /*
+        * It is convenient for callers to pass in __PAGE_KERNEL etc,
+        * and there is no actual harm from setting _PAGE_GLOBAL, so
+        * long as CR4.PGE is not set.  But it is nonetheless troubling
+        * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
+        * requires that not to be #defined to 0): so mask it off here.
+        */
+       flags &= ~_PAGE_GLOBAL;
+
+       for (; address < end_addr; address += PAGE_SIZE) {
+               target_address = get_pa_from_mapping(address);
+               if (target_address == -1) {
+                       ret = -EIO;
+                       break;
+               }
+               pte = kaiser_pagetable_walk(address);
+               if (!pte) {
+                       ret = -ENOMEM;
+                       break;
+               }
+               if (pte_none(*pte)) {
+                       set_pte(pte, __pte(flags | target_address));
+               } else {
+                       pte_t tmp;
+                       set_pte(&tmp, __pte(flags | target_address));
+                       WARN_ON_ONCE(!pte_same(*pte, tmp));
+               }
+       }
+       return ret;
+}
+
+static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags)
+{
+       unsigned long size = end - start;
+
+       return kaiser_add_user_map(start, size, flags);
+}
+
+/*
+ * Ensure that the top level of the (shadow) page tables are
+ * entirely populated.  This ensures that all processes that get
+ * forked have the same entries.  This way, we do not have to
+ * ever go set up new entries in older processes.
+ *
+ * Note: we never free these, so there are no updates to them
+ * after this.
+ */
+static void __init kaiser_init_all_pgds(void)
+{
+       pgd_t *pgd;
+       int i = 0;
+
+       pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+       for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+               pgd_t new_pgd;
+               pud_t *pud = pud_alloc_one(&init_mm,
+                                          PAGE_OFFSET + i * PGDIR_SIZE);
+               if (!pud) {
+                       WARN_ON(1);
+                       break;
+               }
+               inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
+               new_pgd = __pgd(_KERNPG_TABLE |__pa(pud));
+               /*
+                * Make sure not to stomp on some other pgd entry.
+                */
+               if (!pgd_none(pgd[i])) {
+                       WARN_ON(1);
+                       continue;
+               }
+               set_pgd(pgd + i, new_pgd);
+       }
+}
+
+#define kaiser_add_user_map_early(start, size, flags) do {     \
+       int __ret = kaiser_add_user_map(start, size, flags);    \
+       WARN_ON(__ret);                                         \
+} while (0)
+
+#define kaiser_add_user_map_ptrs_early(start, end, flags) do {         \
+       int __ret = kaiser_add_user_map_ptrs(start, end, flags);        \
+       WARN_ON(__ret);                                                 \
+} while (0)
+
+void __init kaiser_check_boottime_disable(void)
+{
+       bool enable = true;
+       char arg[5];
+       int ret;
+
+       if (boot_cpu_has(X86_FEATURE_XENPV))
+               goto silent_disable;
+
+       ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
+       if (ret > 0) {
+               if (!strncmp(arg, "on", 2))
+                       goto enable;
+
+               if (!strncmp(arg, "off", 3))
+                       goto disable;
+
+               if (!strncmp(arg, "auto", 4))
+                       goto skip;
+       }
+
+       if (cmdline_find_option_bool(boot_command_line, "nopti"))
+               goto disable;
+
+skip:
+       if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+               goto disable;
+
+enable:
+       if (enable)
+               setup_force_cpu_cap(X86_FEATURE_KAISER);
+
+       return;
+
+disable:
+       pr_info("disabled\n");
+
+silent_disable:
+       kaiser_enabled = 0;
+       setup_clear_cpu_cap(X86_FEATURE_KAISER);
+}
+
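kaiser_check_boottime_disable() resolves the options in a fixed order: an explicit pti=on always wins, pti=off or nopti disables, and the auto default keeps isolation on except on AMD processors (Xen PV guests are silently disabled before any of this runs). A stand-alone sketch of that decision order, using plain strstr() in place of the kernel's cmdline_find_option() helpers:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* Illustrative model of the boot-time decision; "cmdline" and "amd_cpu"
 * stand in for boot_command_line and the vendor check. */
static bool pti_wanted(const char *cmdline, bool amd_cpu)
{
        if (strstr(cmdline, "pti=on"))
                return true;                    /* explicit enable wins */
        if (strstr(cmdline, "pti=off") || strstr(cmdline, "nopti"))
                return false;                   /* explicit disable */
        /* "pti=auto" or nothing at all: default on, except on AMD */
        return !amd_cpu;
}

int main(void)
{
        printf("%d\n", pti_wanted("ro quiet", false));   /* 1 */
        printf("%d\n", pti_wanted("nopti", false));      /* 0 */
        printf("%d\n", pti_wanted("pti=on", true));      /* 1 */
        printf("%d\n", pti_wanted("pti=auto", true));    /* 0 */
        return 0;
}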
+/*
+ * If anything in here fails, we will likely die on one of the
+ * first kernel->user transitions and init will die.  But, we
+ * will have most of the kernel up by then and should be able to
+ * get a clean warning out of it.  If we BUG_ON() here, we run
+ * the risk of being before we have good console output.
+ */
+void __init kaiser_init(void)
+{
+       int cpu;
+
+       if (!kaiser_enabled)
+               return;
+
+       kaiser_init_all_pgds();
+
+       for_each_possible_cpu(cpu) {
+               void *percpu_vaddr = __per_cpu_user_mapped_start +
+                                    per_cpu_offset(cpu);
+               unsigned long percpu_sz = __per_cpu_user_mapped_end -
+                                         __per_cpu_user_mapped_start;
+               kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
+                                         __PAGE_KERNEL);
+       }
+
+       /*
+        * Map the entry/exit text section, which is needed at
+        * switches from user to and from kernel.
+        */
+       kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
+                                      __PAGE_KERNEL_RX);
+
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+       kaiser_add_user_map_ptrs_early(__irqentry_text_start,
+                                      __irqentry_text_end,
+                                      __PAGE_KERNEL_RX);
+#endif
+       kaiser_add_user_map_early((void *)idt_descr.address,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL_RO);
+#ifdef CONFIG_TRACING
+       kaiser_add_user_map_early(&trace_idt_descr,
+                                 sizeof(trace_idt_descr),
+                                 __PAGE_KERNEL);
+       kaiser_add_user_map_early(&trace_idt_table,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL);
+#endif
+       kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
+                                 __PAGE_KERNEL);
+       kaiser_add_user_map_early(&debug_idt_table,
+                                 sizeof(gate_desc) * NR_VECTORS,
+                                 __PAGE_KERNEL);
+
+       pr_info("enabled\n");
+}
+
+/* Add a mapping to the shadow mapping, and synchronize the mappings */
+int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+       if (!kaiser_enabled)
+               return 0;
+       return kaiser_add_user_map((const void *)addr, size, flags);
+}
+
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+       extern void unmap_pud_range_nofree(pgd_t *pgd,
+                               unsigned long start, unsigned long end);
+       unsigned long end = start + size;
+       unsigned long addr, next;
+       pgd_t *pgd;
+
+       if (!kaiser_enabled)
+               return;
+       pgd = native_get_shadow_pgd(pgd_offset_k(start));
+       for (addr = start; addr < end; pgd++, addr = next) {
+               next = pgd_addr_end(addr, end);
+               unmap_pud_range_nofree(pgd, addr, next);
+       }
+}
+
+/*
+ * Page table pages are page-aligned.  The lower half of the top
+ * level is used for userspace and the top half for the kernel.
+ * This returns true for user pages that need to get copied into
+ * both the user and kernel copies of the page tables, and false
+ * for kernel pages that should only be in the kernel copy.
+ */
+static inline bool is_userspace_pgd(pgd_t *pgdp)
+{
+       return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
+}
+
+pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
+{
+       if (!kaiser_enabled)
+               return pgd;
+       /*
+        * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
+        * skip cases like kexec and EFI which make temporary low mappings.
+        */
+       if (pgd.pgd & _PAGE_USER) {
+               if (is_userspace_pgd(pgdp)) {
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+                       /*
+                        * Even if the entry is *mapping* userspace, ensure
+                        * that userspace can not use it.  This way, if we
+                        * get out to userspace running on the kernel CR3,
+                        * userspace will crash instead of running.
+                        */
+                       if (__supported_pte_mask & _PAGE_NX)
+                               pgd.pgd |= _PAGE_NX;
+               }
+       } else if (!pgd.pgd) {
+               /*
+                * pgd_clear() cannot check _PAGE_USER, and is even used to
+                * clear corrupted pgd entries: so just rely on cases like
+                * kexec and EFI never to be using pgd_clear().
+                */
+               if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
+                   is_userspace_pgd(pgdp))
+                       native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+       }
+       return pgd;
+}
+
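kaiser_set_shadow_pgd() above mirrors user pgd entries into the shadow half, but deliberately poisons the kernel copy with _PAGE_NX when NX is supported: if the CPU ever reaches user space while still on the kernel CR3, the first user instruction fetch faults instead of executing. A small model of that split; the NX bit position is the real one (bit 63), but the types and helper are illustrative:

#include <stdint.h>
#include <stdio.h>

#define PAGE_NX (1ULL << 63)            /* x86-64 no-execute bit */

struct pgd_pair {
        uint64_t kernel_copy;           /* what the kernel CR3 will see */
        uint64_t shadow_copy;           /* what the user CR3 will see */
};

/* Model of the _PAGE_USER branch of kaiser_set_shadow_pgd(). */
static struct pgd_pair install_user_pgd(uint64_t pgd, int nx_supported)
{
        struct pgd_pair p;

        p.shadow_copy = pgd;            /* user copy: usable from user space */
        p.kernel_copy = nx_supported ? (pgd | PAGE_NX) : pgd;
        return p;
}

int main(void)
{
        struct pgd_pair p = install_user_pgd(0x1234067ULL, 1);

        printf("kernel copy: %#llx\n", (unsigned long long)p.kernel_copy);
        printf("shadow copy: %#llx\n", (unsigned long long)p.shadow_copy);
        return 0;
}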
+void kaiser_setup_pcid(void)
+{
+       unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;
+
+       if (this_cpu_has(X86_FEATURE_PCID))
+               user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
+       /*
+        * These variables are used by the entry/exit
+        * code to change PCID and pgd and TLB flushing.
+        */
+       this_cpu_write(x86_cr3_pcid_user, user_cr3);
+}
+
+/*
+ * Make a note that this cpu will need to flush USER tlb on return to user.
+ * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
+ */
+void kaiser_flush_tlb_on_return_to_user(void)
+{
+       if (this_cpu_has(X86_FEATURE_PCID))
+               this_cpu_write(x86_cr3_pcid_user,
+                       X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
+}
+EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
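The per-cpu x86_cr3_pcid_user value maintained by kaiser_setup_pcid() and kaiser_flush_tlb_on_return_to_user() is just a small bit pattern: the offset of the shadow half of the pgd plus, on PCID-capable CPUs, the user ASID and an optional no-flush bit; the exit assembly then ORs it into CR3. A user-space model of that composition, with placeholder constants rather than the kernel's X86_CR3_* values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative placeholders, not the kernel's real constant values. */
#define SHADOW_PGD_OFFSET   0x1000ULL          /* second 4k half of the order-1 pgd */
#define CR3_PCID_USER_ASID  0x80ULL            /* an ASID reserved for user mode */
#define CR3_NOFLUSH         (1ULL << 63)       /* "do not flush this PCID" bit */

static uint64_t user_cr3_bits(bool has_pcid, bool need_flush)
{
        uint64_t bits = SHADOW_PGD_OFFSET;

        if (has_pcid) {
                bits |= CR3_PCID_USER_ASID;
                if (!need_flush)
                        bits |= CR3_NOFLUSH;   /* keep the user ASID's TLB entries */
        }
        return bits;
}

int main(void)
{
        /* Switching mm: the user ASID must be flushed on the next return to user. */
        printf("flush:   %#llx\n", (unsigned long long)user_cr3_bits(true, true));
        /* Ordinary kernel exit: keep the user TLB entries. */
        printf("noflush: %#llx\n", (unsigned long long)user_cr3_bits(true, false));
        /* No PCID: only the shadow-pgd offset gets ORed into CR3. */
        printf("nopcid:  %#llx\n", (unsigned long long)user_cr3_bits(false, false));
        return 0;
}

Arming a flush simply rewrites the value without the no-flush bit, which is what kaiser_flush_tlb_on_return_to_user() does above.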
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index aed206475aa7..319183d93602 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -189,6 +189,6 @@ void __meminit init_trampoline(void)
                *pud_tramp = *pud;
        }
 
-       set_pgd(&trampoline_pgd_entry,
-               __pgd(_KERNPG_TABLE | __pa(pud_page_tramp)));
+       /* Avoid set_pgd(), in case it's complicated by CONFIG_PAGE_TABLE_ISOLATION */
+       trampoline_pgd_entry = __pgd(_KERNPG_TABLE | __pa(pud_page_tramp));
 }
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index e3353c97d086..73dcb0e18c1b 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock);
 #define CPA_FLUSHTLB 1
 #define CPA_ARRAY 2
 #define CPA_PAGES_ARRAY 4
+#define CPA_FREE_PAGETABLES 8
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
@@ -729,10 +730,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte,
        return 0;
 }
 
-static bool try_to_free_pte_page(pte_t *pte)
+static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte)
 {
        int i;
 
+       if (!(cpa->flags & CPA_FREE_PAGETABLES))
+               return false;
+
        for (i = 0; i < PTRS_PER_PTE; i++)
                if (!pte_none(pte[i]))
                        return false;
@@ -741,10 +745,13 @@ static bool try_to_free_pte_page(pte_t *pte)
        return true;
 }
 
-static bool try_to_free_pmd_page(pmd_t *pmd)
+static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd)
 {
        int i;
 
+       if (!(cpa->flags & CPA_FREE_PAGETABLES))
+               return false;
+
        for (i = 0; i < PTRS_PER_PMD; i++)
                if (!pmd_none(pmd[i]))
                        return false;
@@ -753,7 +760,9 @@ static bool try_to_free_pmd_page(pmd_t *pmd)
        return true;
 }
 
-static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
+static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd,
+                           unsigned long start,
+                           unsigned long end)
 {
        pte_t *pte = pte_offset_kernel(pmd, start);
 
@@ -764,22 +773,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end)
                pte++;
        }
 
-       if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) {
+       if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) {
                pmd_clear(pmd);
                return true;
        }
        return false;
 }
 
-static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd,
+static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd,
                              unsigned long start, unsigned long end)
 {
-       if (unmap_pte_range(pmd, start, end))
-               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+       if (unmap_pte_range(cpa, pmd, start, end))
+               if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
                        pud_clear(pud);
 }
 
-static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
+static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud,
+                           unsigned long start, unsigned long end)
 {
        pmd_t *pmd = pmd_offset(pud, start);
 
@@ -790,7 +800,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                unsigned long next_page = (start + PMD_SIZE) & PMD_MASK;
                unsigned long pre_end = min_t(unsigned long, end, next_page);
 
-               __unmap_pmd_range(pud, pmd, start, pre_end);
+               __unmap_pmd_range(cpa, pud, pmd, start, pre_end);
 
                start = pre_end;
                pmd++;
@@ -803,7 +813,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
                if (pmd_large(*pmd))
                        pmd_clear(pmd);
                else
-                       __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE);
+                       __unmap_pmd_range(cpa, pud, pmd,
+                                         start, start + PMD_SIZE);
 
                start += PMD_SIZE;
                pmd++;
@@ -813,17 +824,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
         * 4K leftovers?
         */
        if (start < end)
-               return __unmap_pmd_range(pud, pmd, start, end);
+               return __unmap_pmd_range(cpa, pud, pmd, start, end);
 
        /*
         * Try again to free the PMD page if haven't succeeded above.
         */
        if (!pud_none(*pud))
-               if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud)))
+               if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud)))
                        pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd,
+                             unsigned long start,
+                             unsigned long end)
 {
        pud_t *pud = pud_offset(pgd, start);
 
@@ -834,7 +847,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
                unsigned long next_page = (start + PUD_SIZE) & PUD_MASK;
                unsigned long pre_end   = min_t(unsigned long, end, next_page);
 
-               unmap_pmd_range(pud, start, pre_end);
+               unmap_pmd_range(cpa, pud, start, pre_end);
 
                start = pre_end;
                pud++;
@@ -848,7 +861,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
                if (pud_large(*pud))
                        pud_clear(pud);
                else
-                       unmap_pmd_range(pud, start, start + PUD_SIZE);
+                       unmap_pmd_range(cpa, pud, start, start + PUD_SIZE);
 
                start += PUD_SIZE;
                pud++;
@@ -858,7 +871,7 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
         * 2M leftovers?
         */
        if (start < end)
-               unmap_pmd_range(pud, start, end);
+               unmap_pmd_range(cpa, pud, start, end);
 
        /*
         * No need to try to free the PUD page because we'll free it in
@@ -866,6 +879,24 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
         */
 }
 
+static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+       struct cpa_data cpa = {
+               .flags = CPA_FREE_PAGETABLES,
+       };
+
+       __unmap_pud_range(&cpa, pgd, start, end);
+}
+
+void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end)
+{
+       struct cpa_data cpa = {
+               .flags = 0,
+       };
+
+       __unmap_pud_range(&cpa, pgd, start, end);
+}
+
 static int alloc_pte_page(pmd_t *pmd)
 {
        pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK);
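
The CPA_FREE_PAGETABLES flag threaded through the unmap path above lets the regular CPA code keep freeing empty page-table pages, while the kaiser path (unmap_pud_range_nofree) only clears entries and never releases the pages themselves. A small model of that gate, with an array standing in for a page-table page and illustrative names:

#include <stdbool.h>
#include <stdio.h>

#define ENTRIES           512          /* PTEs per page-table page on x86-64 */
#define FREE_PAGETABLES   0x8          /* stand-in for CPA_FREE_PAGETABLES */

/* Free the table only if the caller allows it and every slot is empty. */
static bool try_to_free_table(unsigned flags, const unsigned long *table)
{
        int i;

        if (!(flags & FREE_PAGETABLES))
                return false;           /* nofree variant: never release the page */

        for (i = 0; i < ENTRIES; i++)
                if (table[i])
                        return false;   /* still in use */

        /* the kernel would free_page() here */
        return true;
}

int main(void)
{
        unsigned long empty[ENTRIES] = { 0 };

        printf("free path:   %d\n", try_to_free_table(FREE_PAGETABLES, empty)); /* 1 */
        printf("nofree path: %d\n", try_to_free_table(0, empty));               /* 0 */
        return 0;
}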
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3feec5af4e67..5aaec8effc5f 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -344,14 +344,22 @@ static inline void _pgd_free(pgd_t *pgd)
                kmem_cache_free(pgd_cache, pgd);
 }
 #else
+
+/*
+ * Instead of one pgd, Kaiser acquires two pgds.  Being order-1, it is
+ * both 8k in size and 8k-aligned.  That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER   kaiser_enabled
+
 static inline pgd_t *_pgd_alloc(void)
 {
-       return (pgd_t *)__get_free_page(PGALLOC_GFP);
+       return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-       free_page((unsigned long)pgd);
+       free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
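
With PGD_ALLOCATION_ORDER defined as kaiser_enabled, an isolated kernel allocates an order-1 (8k, 8k-aligned) pgd: the low 4k is the kernel copy, the high 4k the shadow copy, and flipping bit 12 of a pointer hops between the two, as the comment above describes. The sketch below models that address trick in user space; shadow_half() is an illustrative stand-in for native_get_shadow_pgd(), which is defined elsewhere in the series.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

/* Model: flipping bit 12 of an 8k-aligned allocation toggles
 * between its kernel (low) and shadow (high) 4k halves. */
static uint64_t *shadow_half(uint64_t *kernel_pgd_entry)
{
        return (uint64_t *)((uintptr_t)kernel_pgd_entry ^ PAGE_SIZE);
}

int main(void)
{
        void *pgd;

        /* order-1 allocation: 8k in size and 8k-aligned, like the kaiser pgd */
        if (posix_memalign(&pgd, 2 * PAGE_SIZE, 2 * PAGE_SIZE))
                return 1;

        uint64_t *kern = (uint64_t *)pgd;              /* entry 0, kernel half */
        uint64_t *shad = shadow_half(kern);            /* same entry, shadow half */

        printf("kernel half: %p\n", (void *)kern);
        printf("shadow half: %p\n", (void *)shad);     /* exactly +4096 */

        free(pgd);
        return 0;
}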
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 53b72fb4e781..41205de487e7 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,13 +6,14 @@
 #include <linux/interrupt.h>
 #include <linux/export.h>
 #include <linux/cpu.h>
+#include <linux/debugfs.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
 #include <asm/cache.h>
 #include <asm/apic.h>
 #include <asm/uv/uv.h>
-#include <linux/debugfs.h>
+#include <asm/kaiser.h>
 
 /*
  *     TLB flushing, formerly SMP-only
@@ -34,6 +35,36 @@ struct flush_tlb_info {
        unsigned long flush_end;
 };
 
+static void load_new_mm_cr3(pgd_t *pgdir)
+{
+       unsigned long new_mm_cr3 = __pa(pgdir);
+
+       if (kaiser_enabled) {
+               /*
+                * We reuse the same PCID for different tasks, so we must
+                * flush all the entries for the PCID out when we change tasks.
+                * Flush KERN below, flush USER when returning to userspace in
+                * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro.
+                *
+                * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could
+                * do it here, but can only be used if X86_FEATURE_INVPCID is
+                * available - and many machines support pcid without invpcid.
+                *
+                * If X86_CR3_PCID_KERN_FLUSH actually added something, then it
+                * would be needed in the write_cr3() below - if PCIDs enabled.
+                */
+               BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);
+               kaiser_flush_tlb_on_return_to_user();
+       }
+
+       /*
+        * Caution: many callers of this function expect
+        * that load_cr3() is serializing and orders TLB
+        * fills with respect to the mm_cpumask writes.
+        */
+       write_cr3(new_mm_cr3);
+}
+
 /*
  * We cannot call mmdrop() because we are in interrupt context,
  * instead update mm->cpu_vm_mask.
@@ -45,7 +76,7 @@ void leave_mm(int cpu)
                BUG();
        if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
                cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
-               load_cr3(swapper_pg_dir);
+               load_new_mm_cr3(swapper_pg_dir);
                /*
                 * This gets called in the idle path where RCU
                 * functions differently.  Tracing normally
@@ -120,7 +151,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                 * ordering guarantee we need.
                 *
                 */
-               load_cr3(next->pgd);
+               load_new_mm_cr3(next->pgd);
 
                trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
@@ -167,7 +198,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                         * As above, load_cr3() is serializing and orders TLB
                         * fills with respect to the mm_cpumask write.
                         */
-                       load_cr3(next->pgd);
+                       load_new_mm_cr3(next->pgd);
                        trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
                        load_mm_cr4(next);
                        load_mm_ldt(next);
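
load_new_mm_cr3() keeps a single write_cr3() for the kernel half of the switch but, when kaiser is enabled, first arms a deferred flush of the user PCID, because the same PCID is reused across tasks and its stale entries must not survive into the next task's return to user space. A compact model of that ordering; the flag and the CR3 write are simulated, nothing here touches real control registers:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool kaiser_enabled = true;
static bool user_pcid_flush_pending;     /* models the FLUSH state in x86_cr3_pcid_user */
static uint64_t current_cr3;             /* models the CR3 register */

/* Model of kaiser_flush_tlb_on_return_to_user(). */
static void arm_user_flush(void)
{
        user_pcid_flush_pending = true;  /* the exit code will flush the user ASID */
}

/* Model of load_new_mm_cr3(): flush KERN now (implicit in a CR3 write
 * without NOFLUSH), flush USER later, on the next return to user space. */
static void load_new_mm_cr3(uint64_t next_pgd_pa)
{
        if (kaiser_enabled)
                arm_user_flush();
        current_cr3 = next_pgd_pa;       /* stands in for write_cr3() */
}

int main(void)
{
        load_new_mm_cr3(0x123456000ULL);
        printf("cr3=%#llx, user flush pending=%d\n",
               (unsigned long long)current_cr3, user_pcid_flush_pending);
        return 0;
}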
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index dc81e5287ebf..2e6000a4eb2c 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -778,7 +778,14 @@
  */
 #define PERCPU_INPUT(cacheline)                                                \
        VMLINUX_SYMBOL(__per_cpu_start) = .;                            \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_start) = .;                \
        *(.data..percpu..first)                                         \
+       . = ALIGN(cacheline);                                           \
+       *(.data..percpu..user_mapped)                                   \
+       *(.data..percpu..user_mapped..shared_aligned)                   \
+       . = ALIGN(PAGE_SIZE);                                           \
+       *(.data..percpu..user_mapped..page_aligned)                     \
+       VMLINUX_SYMBOL(__per_cpu_user_mapped_end) = .;                  \
        . = ALIGN(PAGE_SIZE);                                           \
        *(.data..percpu..page_aligned)                                  \
        . = ALIGN(cacheline);                                           \
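The linker-script change groups all ..user_mapped per-cpu input sections between __per_cpu_user_mapped_start and __per_cpu_user_mapped_end, which is exactly the range kaiser_init() maps for every CPU. The user-space sketch below shows the same pattern using GNU ld's automatic __start_/__stop_ section symbols; the section and variable names are illustrative, not the kernel's.

#include <stddef.h>
#include <stdio.h>

/* Variables placed in a dedicated section, like the ..user_mapped per-cpu data.
 * For ELF sections whose name is a valid C identifier, GNU ld provides
 * __start_<sec> and __stop_<sec> symbols automatically. */
__attribute__((section("user_mapped"))) long scratch_a;
__attribute__((section("user_mapped"))) long scratch_b[8];

extern char __start_user_mapped[], __stop_user_mapped[];

int main(void)
{
        /* Analogous to percpu_sz = __per_cpu_user_mapped_end -
         * __per_cpu_user_mapped_start in kaiser_init(): the bounds
         * come from the linker, not from any hand-maintained table. */
        size_t sz = (size_t)(__stop_user_mapped - __start_user_mapped);

        printf("user-mapped region: %p..%p (%zu bytes)\n",
               (void *)__start_user_mapped, (void *)__stop_user_mapped, sz);
        return 0;
}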
diff --git a/include/linux/kaiser.h b/include/linux/kaiser.h
new file mode 100644
index 000000000000..58c55b1589d0
--- /dev/null
+++ b/include/linux/kaiser.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KAISER_H
+#define _LINUX_KAISER_H
+
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#include <asm/kaiser.h>
+
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       /*
+        * Map that page of kernel stack on which we enter from user context.
+        */
+       return kaiser_add_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE, __PAGE_KERNEL);
+}
+
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+       /*
+        * Note: may be called even when kaiser_map_thread_stack() failed.
+        */
+       kaiser_remove_mapping((unsigned long)stack +
+                       THREAD_SIZE - PAGE_SIZE, PAGE_SIZE);
+}
+#else
+
+/*
+ * These stubs are used whenever CONFIG_PAGE_TABLE_ISOLATION is off, which
+ * includes architectures that support KAISER, but have it disabled.
+ */
+
+static inline void kaiser_init(void)
+{
+}
+static inline int kaiser_add_mapping(unsigned long addr,
+                                    unsigned long size, unsigned long flags)
+{
+       return 0;
+}
+static inline void kaiser_remove_mapping(unsigned long start,
+                                        unsigned long size)
+{
+}
+static inline int kaiser_map_thread_stack(void *stack)
+{
+       return 0;
+}
+static inline void kaiser_unmap_thread_stack(void *stack)
+{
+}
+
+#endif /* !CONFIG_PAGE_TABLE_ISOLATION */
+#endif /* _LINUX_KAISER_H */
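kaiser_map_thread_stack() maps only the topmost page of each kernel stack into the shadow tables, the page at stack + THREAD_SIZE - PAGE_SIZE, which is what the entry code touches before the CR3 switch. A tiny sketch of that arithmetic, with an assumed 16k THREAD_SIZE and a made-up stack base:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE   4096UL
#define THREAD_SIZE (4 * PAGE_SIZE)     /* assumption: 16k kernel stacks */

/* Return the address of the one page that would be exposed to the shadow tables. */
static uintptr_t stack_top_page(uintptr_t stack_base)
{
        return stack_base + THREAD_SIZE - PAGE_SIZE;
}

int main(void)
{
        uintptr_t stack = 0x12340000UL;               /* made-up stack base */

        printf("map %#lx .. %#lx\n",
               (unsigned long)stack_top_page(stack),
               (unsigned long)(stack_top_page(stack) + PAGE_SIZE));
        return 0;
}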
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index fff21a82780c..490f5a83f947 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -124,8 +124,9 @@ enum zone_stat_item {
        NR_SLAB_UNRECLAIMABLE,
        NR_PAGETABLE,           /* used for pagetables */
        NR_KERNEL_STACK_KB,     /* measured in KiB */
-       /* Second 128 byte cacheline */
+       NR_KAISERTABLE,
        NR_BOUNCE,
+       /* Second 128 byte cacheline */
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        NR_ZSPAGES,             /* allocated in zsmalloc */
 #endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index 8f16299ca068..8902f23bb770 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -35,6 +35,12 @@
 
 #endif
 
+#ifdef CONFIG_PAGE_TABLE_ISOLATION
+#define USER_MAPPED_SECTION "..user_mapped"
+#else
+#define USER_MAPPED_SECTION ""
+#endif
+
 /*
  * Base implementations of per-CPU variable declarations and definitions, where
  * the section in which the variable is to be placed is provided by the
@@ -115,6 +121,12 @@
 #define DEFINE_PER_CPU(type, name)                                     \
        DEFINE_PER_CPU_SECTION(type, name, "")
 
+#define DECLARE_PER_CPU_USER_MAPPED(type, name)                                \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
+#define DEFINE_PER_CPU_USER_MAPPED(type, name)                         \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION)
+
 /*
  * Declaration/definition used for per-CPU variables that must come first in
  * the set of variables.
@@ -144,6 +156,14 @@
        DEFINE_PER_CPU_SECTION(type, name, PER_CPU_SHARED_ALIGNED_SECTION) \
        ____cacheline_aligned_in_smp
 
+#define DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)         \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
+#define DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(type, name)          \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION PER_CPU_SHARED_ALIGNED_SECTION) \
+       ____cacheline_aligned_in_smp
+
 #define DECLARE_PER_CPU_ALIGNED(type, name)                            \
        DECLARE_PER_CPU_SECTION(type, name, PER_CPU_ALIGNED_SECTION)    \
        ____cacheline_aligned
@@ -162,11 +182,21 @@
 #define DEFINE_PER_CPU_PAGE_ALIGNED(type, name)                                \
        DEFINE_PER_CPU_SECTION(type, name, "..page_aligned")            \
        __aligned(PAGE_SIZE)
+/*
+ * Declaration/definition used for per-CPU variables that must be page aligned and need to be mapped in user mode.
+ */
+#define DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)           \
+       DECLARE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
+
+#define DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(type, name)            \
+       DEFINE_PER_CPU_SECTION(type, name, USER_MAPPED_SECTION"..page_aligned") \
+       __aligned(PAGE_SIZE)
 
 /*
  * Declaration/definition used for per-CPU variables that must be read mostly.
  */
-#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                        \
+#define DECLARE_PER_CPU_READ_MOSTLY(type, name)                                \
        DECLARE_PER_CPU_SECTION(type, name, "..read_mostly")
 
 #define DEFINE_PER_CPU_READ_MOSTLY(type, name)                         \
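All of the *_USER_MAPPED macro variants above differ from their plain counterparts only in the section-name string passed to DECLARE_PER_CPU_SECTION / DEFINE_PER_CPU_SECTION, so with CONFIG_PAGE_TABLE_ISOLATION disabled they collapse back to the ordinary per-cpu sections. A preprocessor-level sketch of how those names concatenate; the helper macro here is illustrative, not the kernel's:

#include <stdio.h>

/* Toggle to see how the section names collapse when isolation is off. */
#define PAGE_TABLE_ISOLATION 1

#if PAGE_TABLE_ISOLATION
#define USER_MAPPED_SECTION "..user_mapped"
#else
#define USER_MAPPED_SECTION ""
#endif

/* Model: the per-cpu base section plus whatever suffix the variant asks for. */
#define PERCPU_SECTION_NAME(suffix) ".data..percpu" suffix

int main(void)
{
        puts(PERCPU_SECTION_NAME(""));                                   /* plain */
        puts(PERCPU_SECTION_NAME(USER_MAPPED_SECTION));                  /* user mapped */
        puts(PERCPU_SECTION_NAME(USER_MAPPED_SECTION "..page_aligned")); /* page aligned */
        return 0;
}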
diff --git a/init/main.c b/init/main.c
index 25bac88bc66e..99f026565608 100644
--- a/init/main.c
+++ b/init/main.c
@@ -80,6 +80,7 @@
 #include <linux/integrity.h>
 #include <linux/proc_ns.h>
 #include <linux/io.h>
+#include <linux/kaiser.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -473,6 +474,7 @@ static void __init mm_init(void)
        pgtable_init();
        vmalloc_init();
        ioremap_huge_init();
+       kaiser_init();
 }
 
 asmlinkage __visible void __init start_kernel(void)
diff --git a/kernel/fork.c b/kernel/fork.c
index 9321b1ad3335..70e10cb49be0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -58,6 +58,7 @@
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
 #include <linux/freezer.h>
+#include <linux/kaiser.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
 #include <linux/random.h>
@@ -213,6 +214,7 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
 
 static inline void free_thread_stack(struct task_struct *tsk)
 {
+       kaiser_unmap_thread_stack(tsk->stack);
 #ifdef CONFIG_VMAP_STACK
        if (task_stack_vm_area(tsk)) {
                unsigned long flags;
@@ -495,6 +497,10 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
         * functions again.
         */
        tsk->stack = stack;
+
+       err = kaiser_map_thread_stack(tsk->stack);
+       if (err)
+               goto free_stack;
 #ifdef CONFIG_VMAP_STACK
        tsk->stack_vm_area = stack_vm_area;
 #endif
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 604f26a4f696..6a088df04b29 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -932,6 +932,7 @@ const char * const vmstat_text[] = {
        "nr_slab_unreclaimable",
        "nr_page_table_pages",
        "nr_kernel_stack",
+       "nr_overhead",
        "nr_bounce",
 #if IS_ENABLED(CONFIG_ZSMALLOC)
        "nr_zspages",
diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c
index 97f9cac98348..e86a34fd5484 100644
--- a/net/ipv4/tcp_bbr.c
+++ b/net/ipv4/tcp_bbr.c
@@ -843,6 +843,11 @@ static u32 bbr_sndbuf_expand(struct sock *sk)
  */
 static u32 bbr_undo_cwnd(struct sock *sk)
 {
+       struct bbr *bbr = inet_csk_ca(sk);
+
+       bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
+       bbr->full_bw_cnt = 0;
+       bbr_reset_lt_bw_sampling(sk);
        return tcp_sk(sk)->snd_cwnd;
 }
 
diff --git a/security/Kconfig b/security/Kconfig
index 118f4549404e..32f36b40e9f0 100644
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -31,6 +31,16 @@ config SECURITY
 
          If you are unsure how to answer this question, answer N.
 
+config PAGE_TABLE_ISOLATION
+       bool "Remove the kernel mapping in user mode"
+       default y
+       depends on X86_64 && SMP
+       help
+         This enforces a strict kernel and user space isolation, in order
+         to close hardware side channels on kernel address information.
+
+         If you are unsure how to answer this question, answer Y.
+
 config SECURITYFS
        bool "Enable the securityfs filesystem"
        help
diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h
index a39629206864..f79669a38c0c 100644
--- a/tools/arch/x86/include/asm/cpufeatures.h
+++ b/tools/arch/x86/include/asm/cpufeatures.h
@@ -197,6 +197,9 @@
 #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
 #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
+/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */
+#define X86_FEATURE_KAISER     ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */
+
 /* Virtualization flags: Linux defined, word 8 */
 #define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
 #define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
