[PATCH v25 28/30] mm: Move arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h

2021-04-15 Thread Yu-cheng Yu
To prepare for the introduction of PROT_SHSTK and to be consistent with
other architectures, move arch_vm_get_page_prot() and
arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
 arch/x86/include/asm/mman.h  | 30 ++
 arch/x86/include/uapi/asm/mman.h | 27 +++
 2 files changed, 33 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/include/asm/mman.h

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index ..629f6c81263a
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#include 
+#include 
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags) __pgprot(	\
+   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
+   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
+   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
+   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index d4a8d0424bfb..3ce1923e6ed9 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -1,31 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_MMAN_H
-#define _ASM_X86_MMAN_H
+#ifndef _UAPI_ASM_X86_MMAN_H
+#define _UAPI_ASM_X86_MMAN_H
 
 #define MAP_32BIT	0x40	/* only give out 32bit addresses */
 
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-/*
- * Take the 4 protection key bits out of the vma->vm_flags
- * value and turn them in to the bits that we can put in
- * to a pte.
- *
- * Only override these if Protection Keys are available
- * (which is only on 64-bit).
- */
-#define arch_vm_get_page_prot(vm_flags) __pgprot(	\
-   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
-
-#define arch_calc_vm_prot_bits(prot, key) (\
-   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
-   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
-   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
-   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
-#endif
 
 #include 
 
-#endif /* _ASM_X86_MMAN_H */
+#endif /* _UAPI_ASM_X86_MMAN_H */
-- 
2.21.0



[PATCH v25 9/9] x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

2021-04-15 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control-protection (#CP) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add ENDBR to __vdso_sgx_enter_enclave() branch targets.
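
For illustration, a minimal user-space sketch of why the entry points need
ENDBR (the function-pointer type is simplified and hypothetical; the real
__vdso_sgx_enter_enclave() signature differs):

	/* The vDSO entry is resolved at run time, so reaching it is an
	 * indirect CALL; with IBT enforced, the first instruction executed
	 * at the target must be ENDBR64, or the CPU raises #CP.
	 */
	typedef int (*vdso_fn_t)(void);		/* simplified signature */

	static int call_vdso_entry(vdso_fn_t fn)
	{
		return fn();	/* indirect call: target needs ENDBR64 */
	}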

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
 arch/x86/entry/vdso/vsgx.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 86a0e94f68df..f3ebd38d1898 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include <asm/vdso.h>
 
 #include "extable.h"
 
@@ -27,6 +28,7 @@
 SYM_FUNC_START(__vdso_sgx_enter_enclave)
/* Prolog */
.cfi_startproc
+   ENDBR64
push%rbp
.cfi_adjust_cfa_offset  8
.cfi_rel_offset %rbp, 0
@@ -62,6 +64,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
 .Lasync_exit_pointer:
 .Lenclu_eenter_eresume:
enclu
+   ENDBR64
 
/* EEXIT jumps here unless the enclave is doing something fancy. */
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
@@ -91,6 +94,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
jmp .Lout
 
 .Lhandle_exception:
+   ENDBR64
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
 
/* Set the exception info. */
-- 
2.21.0



[PATCH v25 8/9] x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point

2021-04-15 Thread Yu-cheng Yu
From: "H.J. Lu" 

ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control-protection (#CP) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add that to __kernel_vsyscall entry point.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Kees Cook 
---
 arch/x86/entry/vdso/vdso32/system_call.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index de1fff7188aa..7793dc221726 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include <asm/vdso.h>
 
.text
.globl __kernel_vsyscall
@@ -14,6 +15,7 @@
ALIGN
 __kernel_vsyscall:
CFI_STARTPROC
+   ENDBR32
/*
 * Reshuffle regs so that all of any of the entry instructions
 * will preserve enough state.
-- 
2.21.0



[PATCH v25 7/9] x86/vdso: Introduce ENDBR macro

2021-04-15 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control-protection (#CP) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

There are two ENDBR versions: endbr64 and endbr32.  The compilers (gcc and
clang) have _CET_ENDBR defined for the proper one.  Introduce ENDBR macro,
which equals the compiler macro when enabled, otherwise nothing.

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
v25:
- Change from using the compiler's cet.h back to just ENDBR64/ENDBR32,
  since the information is already known, and keep it simple.

 arch/x86/include/asm/vdso.h | 20 +++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 98aa103eb4ab..97358246e4c7 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -52,6 +52,24 @@ extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
 extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
 unsigned long error_code,
 unsigned long fault_addr);
-#endif /* __ASSEMBLER__ */
+#else /* __ASSEMBLER__ */
+
+/*
+ * ENDBR is an instruction for the Indirect Branch Tracking (IBT) component
+ * of CET.  IBT prevents attacks by ensuring that (most) indirect branches
+ * and function calls may only land at ENDBR instructions.  Branches that
+ * don't follow the rules will result in control-protection (#CP) exceptions.
+ * ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
+ * instructions are inserted automatically by the compiler, but branch
+ * targets written in assembly must have ENDBR added manually.
+ */
+#ifdef CONFIG_X86_IBT
+#define ENDBR64 endbr64
+#define ENDBR32 endbr32
+#else
+#define ENDBR64
+#define ENDBR32
+#endif
 
+#endif /* __ASSEMBLER__ */
 #endif /* _ASM_X86_VDSO_H */
-- 
2.21.0



[PATCH v25 6/9] x86/vdso: Insert endbr32/endbr64 to vDSO

2021-04-15 Thread Yu-cheng Yu
From: "H.J. Lu" 

When Indirect Branch Tracking (IBT) is enabled, vDSO functions may be
called indirectly, and must have ENDBR32 or ENDBR64 as the first
instruction.  The compiler must support -fcf-protection=branch so that it
can be used to compile vDSO.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Kees Cook 
---
v24:
- Replace CONFIG_X86_CET with CONFIG_X86_IBT to reflect splitting of shadow
  stack and ibt.

 arch/x86/entry/vdso/Makefile | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 05c4abc2fdfd..a773a5f03b63 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -93,6 +93,10 @@ endif
 
 $(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 
+ifdef CONFIG_X86_IBT
+$(vobjs) $(vobjs32): KBUILD_CFLAGS += -fcf-protection=branch
+endif
+
 #
 # vDSO code runs in userspace and -pg doesn't help with profiling anyway.
 #
-- 
2.21.0



[PATCH v25 4/9] x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking

2021-04-15 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates features the file supports.
The property is parsed at load time and passed to
arch_setup_elf_property().  Update it for Indirect Branch Tracking.
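
As a sketch of the property format (a hypothetical helper, mirroring the
existing shadow stack handling rather than defining a new interface), the
payload of GNU_PROPERTY_X86_FEATURE_1_AND is a single 32-bit feature
bitmask:

	static bool elf_wants_ibt(const void *data)
	{
		/* arch_parse_elf_property() has verified datasz == 4. */
		unsigned int feat = *(const unsigned int *)data;

		return feat & GNU_PROPERTY_X86_FEATURE_1_IBT;
	}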

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for splitting shadow stack and ibt.

 arch/x86/Kconfig | 2 ++
 arch/x86/kernel/process_64.c | 8 
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6bb69fba0dad..7436e3a608e8 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1970,6 +1970,8 @@ config X86_IBT
def_bool n
depends on X86_SHADOW_STACK
depends on $(cc-option,-fcf-protection)
+   select ARCH_USE_GNU_PROPERTY
+   select ARCH_BINFMT_ELF_STATE
help
  Indirect Branch Tracking (IBT) provides protection against
  CALL-/JMP-oriented programming attacks.  It is active when
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d71045b29475..bf8ef10e5b78 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -864,6 +864,14 @@ int arch_setup_elf_property(struct arch_elf_state *state)
r = shstk_setup();
}
 
+   if (r < 0)
+   return r;
+
+   if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   r = ibt_setup();
+   }
+
return r;
 }
 #endif
-- 
2.21.0



[PATCH v25 5/9] x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking

2021-04-15 Thread Yu-cheng Yu
From: "H.J. Lu" 

Update ARCH_X86_CET_STATUS and ARCH_X86_CET_DISABLE for Indirect Branch
Tracking.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
v24:
- Update for function name changes from splitting shadow stack and ibt.

 arch/x86/kernel/cet_prctl.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
index 3bb9f32ca70d..ab05597545c5 100644
--- a/arch/x86/kernel/cet_prctl.c
+++ b/arch/x86/kernel/cet_prctl.c
@@ -22,6 +22,9 @@ static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
buf[2] = cet->shstk_size;
}
 
+   if (cet->ibt_enabled)
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_IBT;
+
return copy_to_user(ubuf, buf, sizeof(buf));
 }
 
@@ -46,6 +49,8 @@ int prctl_cet(int option, u64 arg2)
return -EINVAL;
if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
shstk_disable();
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   ibt_disable();
return 0;
 
case ARCH_X86_CET_LOCK:
-- 
2.21.0



[PATCH v25 3/9] x86/cet/ibt: Handle signals for Indirect Branch Tracking

2021-04-15 Thread Yu-cheng Yu
When an indirect CALL/JMP instruction has been executed but has not yet
reached its target, the processor is in 'WAIT_ENDBR' status, which can be
read from MSR_IA32_U_CET.  This status is part of a task's state when a
signal is raised, so it is preserved in the signal frame and restored on
sigreturn.

The IBT state machine is described in Intel SDM Vol. 1, Sec. 18.3.
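
A condensed view of the tracker, simplified from the SDM:

	/*
	 * IDLE --(indirect CALL/JMP)--> WAIT_ENDBR
	 * WAIT_ENDBR --(next instruction is ENDBR)--> IDLE
	 * WAIT_ENDBR --(any other instruction)--> #CP fault
	 *
	 * The WAIT_ENDBR bit of MSR_IA32_U_CET is cleared before entering
	 * the signal handler and set again on sigreturn, which is what the
	 * changes below implement.
	 */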

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Move the addition of sc_ext.wait_endbr from an earlier shadow stack
  patch to here.
- Change X86_FEATURE_CET to X86_FEATURE_SHSTK.
- Change wrmsrl() to wrmsrl_safe() and handle error.
v24:
- Update for changes from splitting shadow stack and ibt.

 arch/x86/include/uapi/asm/sigcontext.h |  1 +
 arch/x86/kernel/fpu/signal.c   | 33 +++---
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index 10d7fa192d48..ee5bacce7d87 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -203,6 +203,7 @@ struct _xstate {
 struct sc_ext {
unsigned long total_size;
unsigned long ssp;
+   unsigned long wait_endbr;
 };
 
 /*
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 0488407bec81..0ed01e70b09e 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -71,16 +71,29 @@ int save_extra_state_to_sigframe(int ia32, void __user *fp, void __user *restore
return err;
 
ext.ssp = token_addr;
+   }
 
+   if (new_ssp || cet->ibt_enabled) {
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
__fpregs_load_activate();
if (new_ssp)
err = wrmsrl_safe(MSR_IA32_PL3_SSP, new_ssp);
+
+   if (!err && cet->ibt_enabled) {
+   u64 msr_val;
+
+   err = rdmsrl_safe(MSR_IA32_U_CET, &msr_val);
+   if (!err && (msr_val & CET_WAIT_ENDBR)) {
+   ext.wait_endbr = 1;
+   msr_val &= ~CET_WAIT_ENDBR;
+   err = wrmsrl_safe(MSR_IA32_U_CET, msr_val);
+   }
+   }
fpregs_unlock();
}
 
-   if (!err && ext.ssp) {
+   if (!err && (ext.ssp || cet->ibt_enabled)) {
void __user *p = fp;
 
ext.total_size = sizeof(ext);
@@ -110,7 +123,8 @@ static int get_extra_state_from_sigframe(int ia32, void __user *fp, struct sc_ex
if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
return 0;
 
-   if (!cet->shstk_size)
+   if (!cet->shstk_size &&
+   !cet->ibt_enabled)
return 0;
 
memset(ext, 0, sizeof(*ext));
@@ -149,6 +163,19 @@ static int restore_extra_state_to_xregs(struct sc_ext *sc_ext)
 
if (cet->shstk_size)
err = wrmsrl_safe(MSR_IA32_PL3_SSP, sc_ext->ssp);
+
+   if (err)
+   return err;
+
+   if (cet->ibt_enabled && sc_ext->wait_endbr) {
+   u64 msr_val;
+
+   err = rdmsrl_safe(MSR_IA32_U_CET, &msr_val);
+   if (!err) {
+   msr_val |= CET_WAIT_ENDBR;
+   err = wrmsrl_safe(MSR_IA32_U_CET, msr_val);
+   }
+   }
 #endif
return err;
 }
@@ -616,7 +643,7 @@ static unsigned long fpu__alloc_sigcontext_ext(unsigned long sp)
 * sigcontext_ext is at: fpu + fpu_user_xstate_size +
 * FP_XSTATE_MAGIC2_SIZE, then aligned to 8.
 */
-   if (cet->shstk_size)
+   if (cet->shstk_size || cet->ibt_enabled)
sp -= (sizeof(struct sc_ext) + 8);
 #endif
return sp;
-- 
2.21.0



[PATCH v25 30/30] mm: Introduce PROT_SHSTK for shadow stack

2021-04-15 Thread Yu-cheng Yu
There are three possible options to create a shadow stack allocation API:
an arch_prctl, a new syscall, or adding PROT_SHSTK to mmap()/mprotect().
Each has its advantages and compromises.

An arch_prctl() is the least intrusive.  However, the existing x86
arch_prctl() takes only two parameters.  Multiple parameters must be
passed in a memory buffer.  There is a proposal to pass more parameters in
registers [1], but no active discussion on that.

A new syscall minimizes compatibility issues and offers an extensible
framework to other architectures, but it would likely overlap with
mmap()/mprotect().

The introduction of PROT_SHSTK to mmap()/mprotect() takes advantage of
existing APIs.  The x86-specific PROT_SHSTK is translated to
VM_SHADOW_STACK and a shadow stack mapping is created without reinventing
the wheel.  There are potential pitfalls, though.  The most obvious one
would be using it to bypass shadow stack protection.  However, an attacker
would have to reach the syscall first.
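
A hypothetical user-space use, assuming the PROT_SHSTK value defined in
this patch (the constant is not yet in libc headers) and a task already
running with shadow stack enabled:

	#include <sys/mman.h>

	#ifndef PROT_SHSTK
	#define PROT_SHSTK 0x10		/* from this patch */
	#endif

	static void *alloc_user_shstk(size_t size)
	{
		/* Must be anonymous and must not include PROT_WRITE. */
		return mmap(NULL, size, PROT_READ | PROT_SHSTK,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	}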

[1] https://lore.kernel.org/lkml/20200828121624.108243-1-hjl.to...@gmail.com/

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Update arch_calc_vm_prot_bits(), leave PROT* checking to
  arch_validate_prot().
- Update arch_validate_prot(), leave vma flags checking to
  arch_validate_flags().
- Add arch_validate_flags().

 arch/x86/include/asm/mman.h  | 59 +++-
 arch/x86/include/uapi/asm/mman.h |  1 +
 include/linux/mm.h   |  1 +
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 629f6c81263a..1821c179f35d 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -20,11 +20,68 @@
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
 
-#define arch_calc_vm_prot_bits(prot, key) (\
+#define pkey_vm_prot_bits(prot, key) ( \
((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#else
+#define pkey_vm_prot_bits(prot, key) (0)
 #endif
 
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+  unsigned long pkey)
+{
+   unsigned long vm_prot_bits = pkey_vm_prot_bits(prot, pkey);
+
+   if (prot & PROT_SHSTK)
+   vm_prot_bits |= VM_SHADOW_STACK;
+
+   return vm_prot_bits;
+}
+
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+#ifdef CONFIG_X86_SHADOW_STACK
+static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
+{
+   unsigned long valid = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM |
+ PROT_SHSTK;
+
+   if (prot & ~valid)
+   return false;
+
+   if (prot & PROT_SHSTK) {
+   if (!current->thread.cet.shstk_size)
+   return false;
+
+   /*
+* A shadow stack mapping is writable only indirectly, by
+* the CALL and WRUSS instructions, not by other write
+* instructions.  PROT_SHSTK and PROT_WRITE are therefore
+* mutually exclusive.
+*/
+   if (prot & PROT_WRITE)
+   return false;
+   }
+
+   return true;
+}
+
+#define arch_validate_prot arch_validate_prot
+
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
+{
+   if (vm_flags & VM_SHADOW_STACK) {
+   if ((vm_flags & VM_SHARED) || !is_anon)
+   return false;
+   }
+
+   return true;
+}
+
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
+
+#endif /* CONFIG_X86_SHADOW_STACK */
+
 #endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 3ce1923e6ed9..39bb7db344a6 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -4,6 +4,7 @@
 
 #define MAP_32BIT	0x40	/* only give out 32bit addresses */
 
+#define PROT_SHSTK 0x10/* shadow stack pages */
 
 #include 
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ccec5cc399b..9a7652eea207 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -342,6 +342,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86)
 # define VM_PAT	VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
+# define VM_ARCH_CLEAR VM_SHADOW_STACK
 #elif defined(CONFIG_PPC)
 # define VM_SAO	VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
-- 
2.21.0



[PATCH v25 29/30] mm: Update arch_validate_flags() to include vma anonymous

2021-04-15 Thread Yu-cheng Yu
When newer VM flags are created, such as VM_MTE, mmap()/mprotect() must be
able to verify whether certain flags are being applied to an anonymous VMA.

To solve this, one approach is adding a VM flag to track that MAP_ANONYMOUS
is specified [1], and then using the flag in arch_validate_flags().

Another approach is passing the result of vma_is_anonymous() to
arch_validate_flags().  To prepare for the introduction of PROT_SHSTK,
which creates a shadow stack mapping and can only be applied to an
anonymous VMA, update arch_validate_flags() to include anonymous VMA
information.

[1] commit 9f3419315f3c ("arm64: mte: Add PROT_MTE support to mmap() and mprotect()")

Signed-off-by: Yu-cheng Yu 
Cc: Catalin Marinas 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
Cc: Vincenzo Frascino 
Cc: Will Deacon 
---
 arch/arm64/include/asm/mman.h | 4 ++--
 arch/sparc/include/asm/mman.h | 4 ++--
 include/linux/mman.h  | 2 +-
 mm/mmap.c | 2 +-
 mm/mprotect.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index e3e28f7daf62..44add1a09041 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -74,7 +74,7 @@ static inline bool arch_validate_prot(unsigned long prot,
 }
 #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)
 
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
 {
if (!system_supports_mte())
return true;
@@ -82,6 +82,6 @@ static inline bool arch_validate_flags(unsigned long vm_flags)
/* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */
return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED);
 }
-#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
 
 #endif /* ! __ASM_MMAN_H__ */
diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h
index 274217e7ed70..4a897c8a3f1a 100644
--- a/arch/sparc/include/asm/mman.h
+++ b/arch/sparc/include/asm/mman.h
@@ -60,11 +60,11 @@ static inline int sparc_validate_prot(unsigned long prot, unsigned long addr)
return 1;
 }
 
-#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
 /* arch_validate_flags() - Ensure combination of flags is valid for a
  * VMA.
  */
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
 {
/* If ADI is being enabled on this VMA, check for ADI
 * capability on the platform and ensure VMA is suitable
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba..a22ed4495d13 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -114,7 +114,7 @@ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
  *
  * Returns true if the VM_* flags are valid.
  */
-static inline bool arch_validate_flags(unsigned long flags)
+static inline bool arch_validate_flags(unsigned long flags, bool is_anonymous)
 {
return true;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b2992ef8ee0..db849e3ed9d3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1850,7 +1850,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
 
/* Allow architectures to sanity-check the vm_flags */
-   if (!arch_validate_flags(vma->vm_flags)) {
+   if (!arch_validate_flags(vma->vm_flags, vma_is_anonymous(vma))) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 3b2f0d75519f..64378b963548 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -611,7 +611,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
 
/* Allow architectures to sanity-check the new flags */
-   if (!arch_validate_flags(newflags)) {
+   if (!arch_validate_flags(newflags, vma_is_anonymous(vma))) {
error = -EINVAL;
goto out;
}
-- 
2.21.0



[PATCH v25 2/9] x86/cet/ibt: Add user-mode Indirect Branch Tracking support

2021-04-15 Thread Yu-cheng Yu
Introduce user-mode Indirect Branch Tracking (IBT) support.  Add routines
for the setup/disable of IBT.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Move IBT routines to a separate ibt.c, update related areas accordingly.

 arch/x86/include/asm/cet.h |  9 ++
 arch/x86/kernel/Makefile   |  1 +
 arch/x86/kernel/ibt.c  | 57 ++
 3 files changed, 67 insertions(+)
 create mode 100644 arch/x86/kernel/ibt.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 662335ceb57f..17afcc9ea4d1 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -15,6 +15,7 @@ struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
unsigned intlocked:1;
+   unsigned intibt_enabled:1;
 };
 
 #ifdef CONFIG_X86_SHADOW_STACK
@@ -41,6 +42,14 @@ static inline int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
  unsigned long *new_ssp) { return 0; }
 #endif
 
+#ifdef CONFIG_X86_IBT
+int ibt_setup(void);
+void ibt_disable(void);
+#else
+static inline int ibt_setup(void) { return 0; }
+static inline void ibt_disable(void) {}
+#endif
+
 #ifdef CONFIG_X86_SHADOW_STACK
 int prctl_cet(int option, u64 arg2);
 #else
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index eb13d578ad36..e10e007c1d80 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -151,6 +151,7 @@ obj-$(CONFIG_UNWINDER_GUESS)	+= unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
 obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o cet_prctl.o
+obj-$(CONFIG_X86_IBT)  += ibt.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/ibt.c b/arch/x86/kernel/ibt.c
new file mode 100644
index ..d2cef1a0345b
--- /dev/null
+++ b/arch/x86/kernel/ibt.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ibt.c - Intel Indirect Branch Tracking support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void start_update_msrs(void)
+{
+   fpregs_lock();
+   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   __fpregs_load_activate();
+}
+
+static void end_update_msrs(void)
+{
+   fpregs_unlock();
+}
+
+int ibt_setup(void)
+{
+   u64 msr_val;
+
+   if (!cpu_feature_enabled(X86_FEATURE_IBT))
+   return -EOPNOTSUPP;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val |= (CET_ENDBR_EN | CET_NO_TRACK_EN);
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 1;
+   return 0;
+}
+
+void ibt_disable(void)
+{
+   u64 msr_val;
+
+   if (!cpu_feature_enabled(X86_FEATURE_IBT))
+   return;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val &= ~CET_ENDBR_EN;
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 0;
+}
-- 
2.21.0



[PATCH v25 0/9] Control-flow Enforcement: Indirect Branch Tracking

2021-04-15 Thread Yu-cheng Yu
Control-flow Enforcement Technology (CET) is a new Intel processor feature
that blocks return/jump-oriented programming attacks.  Details are in
"Intel 64 and IA-32 Architectures Software Developer's Manual" [1].

This is the second part of CET and enables Indirect Branch Tracking (IBT).
It is built on top of the shadow stack series.

Changes in v25:
- Make updates to Kconfig and CPU feature flags for the removal of Kconfig
  X86_CET and software-defined X86_FEATURE_CET.
- Update ENDBR definition.
- Rebase to Linus tree v5.12-rc7.

[1] Intel 64 and IA-32 Architectures Software Developer's Manual:

https://software.intel.com/en-us/download/intel-64-and-ia-32-architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4

[2] Indirect Branch Tracking patches v24:

https://lore.kernel.org/r/20210401221403.32253-1-yu-cheng...@intel.com/

H.J. Lu (3):
  x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking
  x86/vdso: Insert endbr32/endbr64 to vDSO
  x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point

Yu-cheng Yu (6):
  x86/cet/ibt: Add Kconfig option for Indirect Branch Tracking
  x86/cet/ibt: Add user-mode Indirect Branch Tracking support
  x86/cet/ibt: Handle signals for Indirect Branch Tracking
  x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking
  x86/vdso: Introduce ENDBR macro
  x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

 arch/x86/Kconfig | 21 +
 arch/x86/entry/vdso/Makefile |  4 ++
 arch/x86/entry/vdso/vdso32/system_call.S |  2 +
 arch/x86/entry/vdso/vsgx.S   |  4 ++
 arch/x86/include/asm/cet.h   |  9 
 arch/x86/include/asm/disabled-features.h |  8 +++-
 arch/x86/include/asm/vdso.h  | 20 -
 arch/x86/include/uapi/asm/sigcontext.h   |  1 +
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/cet_prctl.c  |  5 +++
 arch/x86/kernel/fpu/signal.c | 33 --
 arch/x86/kernel/ibt.c| 57 
 arch/x86/kernel/process_64.c |  8 
 13 files changed, 168 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/kernel/ibt.c

-- 
2.21.0



[PATCH v25 1/9] x86/cet/ibt: Add Kconfig option for Indirect Branch Tracking

2021-04-15 Thread Yu-cheng Yu
Indirect Branch Tracking (IBT) provides protection against CALL-/JMP-
oriented programming attacks.  It is active when the kernel has this
feature enabled, and the processor and the application support it.
When this feature is enabled, legacy non-IBT applications continue to
work, but without IBT protection.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Make CONFIG_X86_IBT depend on CONFIG_X86_SHADOW_STACK.

 arch/x86/Kconfig | 19 +++
 arch/x86/include/asm/disabled-features.h |  8 +++-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 77d2e44995d7..6bb69fba0dad 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1965,6 +1965,25 @@ config X86_SHADOW_STACK
 
  If unsure, say N.
 
+config X86_IBT
+   prompt "Intel Indirect Branch Tracking"
+   def_bool n
+   depends on X86_SHADOW_STACK
+   depends on $(cc-option,-fcf-protection)
+   help
+ Indirect Branch Tracking (IBT) provides protection against
+ CALL-/JMP-oriented programming attacks.  It is active when
+ the kernel has this feature enabled, and the processor and
+ the application support it.  When this feature is enabled,
+ legacy non-IBT applications continue to work, but without
+ IBT protection.
+ Support for this feature is present on the Tiger Lake family of
+ processors released in 2020 or later.  Enabling this feature
+ increases kernel text size by 3.7 KB.
+ See Documentation/x86/intel_cet.rst for more information.
+
+ If unsure, say N.
+
 config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index e5c6ed9373e8..07cc40d49947 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -74,6 +74,12 @@
 #define DISABLE_SHSTK  (1 << (X86_FEATURE_SHSTK & 31))
 #endif
 
+#ifdef CONFIG_X86_IBT
+#define DISABLE_IBT	0
+#else
+#define DISABLE_IBT	(1 << (X86_FEATURE_IBT & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -96,7 +102,7 @@
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
 			 DISABLE_ENQCMD|DISABLE_SHSTK)
 #define DISABLED_MASK17	0
-#define DISABLED_MASK18	0
+#define DISABLED_MASK18	(DISABLE_IBT)
 #define DISABLED_MASK19	0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
 
-- 
2.21.0



[PATCH v25 27/30] x86/cet/shstk: Add arch_prctl functions for shadow stack

2021-04-15 Thread Yu-cheng Yu
arch_prctl(ARCH_X86_CET_STATUS, u64 *args)
Get CET feature status.

The parameter 'args' is a pointer to a user buffer.  The kernel returns
the following information:

*args = shadow stack/IBT status
*(args + 1) = shadow stack base address
*(args + 2) = shadow stack size

32-bit binaries use the same interface, but only lower 32-bits of each
item.

arch_prctl(ARCH_X86_CET_DISABLE, unsigned int features)
Disable CET features specified in 'features'.  Return -EPERM if CET is
locked.

arch_prctl(ARCH_X86_CET_LOCK)
Lock in CET features.

Also change do_arch_prctl_common()'s parameter 'cpuid_enabled' to
'arg2', as it is now also passed to prctl_cet().
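
For illustration, a hypothetical user-space query; arch_prctl() is reached
through syscall(), and the constant value comes from this patch:

	#include <sys/syscall.h>
	#include <unistd.h>

	#define ARCH_X86_CET_STATUS	0x3001	/* from asm/prctl.h below */

	static long get_cet_status(unsigned long long buf[3])
	{
		/* buf[0] = features, buf[1] = shstk base, buf[2] = size */
		return syscall(SYS_arch_prctl, ARCH_X86_CET_STATUS, buf);
	}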

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
v25:
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK.
- Change X86_FEATURE_CET to X86_FEATURE_SHSTK.
v24:
- Update #ifdef placement relating to shadow stack and ibt split.
- Update function names.

 arch/x86/include/asm/cet.h|  7 
 arch/x86/include/uapi/asm/prctl.h |  4 +++
 arch/x86/kernel/Makefile  |  2 +-
 arch/x86/kernel/cet_prctl.c   | 60 +++
 arch/x86/kernel/process.c |  6 ++--
 5 files changed, 75 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/kernel/cet_prctl.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 5e66919bd2fe..662335ceb57f 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -14,6 +14,7 @@ struct sc_ext;
 struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
+   unsigned intlocked:1;
 };
 
 #ifdef CONFIG_X86_SHADOW_STACK
@@ -40,6 +41,12 @@ static inline int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
  unsigned long *new_ssp) { return 0; }
 #endif
 
+#ifdef CONFIG_X86_SHADOW_STACK
+int prctl_cet(int option, u64 arg2);
+#else
+static inline int prctl_cet(int option, u64 arg2) { return -EINVAL; }
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..9245bf629120 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,8 @@
 #define ARCH_MAP_VDSO_32   0x2002
 #define ARCH_MAP_VDSO_64   0x2003
 
+#define ARCH_X86_CET_STATUS0x3001
+#define ARCH_X86_CET_DISABLE   0x3002
+#define ARCH_X86_CET_LOCK  0x3003
+
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f99b093f350..eb13d578ad36 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,7 +150,7 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
-obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o
+obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o cet_prctl.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index ..3bb9f32ca70d
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
+{
+   u64 buf[3] = {};
+
+   if (cet->shstk_size) {
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+   buf[1] = cet->shstk_base;
+   buf[2] = cet->shstk_size;
+   }
+
+   return copy_to_user(ubuf, buf, sizeof(buf));
+}
+
+int prctl_cet(int option, u64 arg2)
+{
+   struct cet_status *cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return -ENOTSUPP;
+
+   cet = &current->thread.cet;
+
+   if (option == ARCH_X86_CET_STATUS)
+   return cet_copy_status_to_user(cet, (u64 __user *)arg2);
+
+   switch (option) {
+   case ARCH_X86_CET_DISABLE:
+   if (cet->locked)
+   return -EPERM;
+
+   if (arg2 & ~GNU_PROPERTY_X86_FEATURE_1_VALID)
+   return -EINVAL;
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   shstk_disable();
+   return 0;
+
+   case ARCH_X86_CET_LOCK:
+   if (arg2)
+   return -EINVAL;
+   cet->locked = 1;
+   return 0;
+
+   default:
+   return -ENOSYS;
+   }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fa01e8679d01..315668a334fd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -980,14 +980,14 @@ unsigned long get_wchan(struct t

[PATCH v25 26/30] ELF: Introduce arch_setup_elf_property()

2021-04-15 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates arch features supported by the
file.  These features are extracted by arch_parse_elf_property() and stored
in 'arch_elf_state'.

Introduce x86 feature definitions and arch_setup_elf_property(), which
enables such features.  The first use-case of this function is Shadow
Stack.

ARM64 is the other arch that has ARCH_USE_GNU_PROPERTY and
arch_parse_elf_property().  Add arch_setup_elf_property() for it.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Mark Brown 
Cc: Catalin Marinas 
Cc: Dave Martin 
---
v24:
- Change cet_setup_shstk() to shstk_setup() to reflect function name changes
  relating to the splitting of shadow stack and ibt.

 arch/arm64/include/asm/elf.h |  5 +
 arch/x86/Kconfig |  2 ++
 arch/x86/include/asm/elf.h   | 13 +
 arch/x86/kernel/process_64.c | 32 
 fs/binfmt_elf.c  |  4 
 include/linux/elf.h  |  6 ++
 include/uapi/linux/elf.h |  9 +
 7 files changed, 71 insertions(+)

diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 8d1c8dcb87fd..d37bc7915935 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -281,6 +281,11 @@ static inline int arch_parse_elf_property(u32 type, const void *data,
return 0;
 }
 
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
+
 static inline int arch_elf_pt_proc(void *ehdr, void *phdr,
   struct file *f, bool is_interp,
   struct arch_elf_state *state)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 41283f82fd87..77d2e44995d7 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1951,6 +1951,8 @@ config X86_SHADOW_STACK
depends on AS_WRUSS
depends on ARCH_HAS_SHADOW_STACK
select ARCH_USES_HIGH_VMA_FLAGS
+   select ARCH_USE_GNU_PROPERTY
+   select ARCH_BINFMT_ELF_STATE
help
  Shadow Stack protection is a hardware feature that detects function
  return address corruption.  This helps mitigate ROP attacks.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..6a131047be8a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -390,6 +390,19 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 
 extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs);
 
+#ifdef CONFIG_ARCH_BINFMT_ELF_STATE
+struct arch_elf_state {
+   unsigned int gnu_property;
+};
+
+#define INIT_ARCH_ELF_STATE {  \
+   .gnu_property = 0,  \
+}
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0)
+#endif
+
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
ALIGN_VA_32 = BIT(0),
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d08307df69ad..d71045b29475 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -835,3 +835,35 @@ unsigned long KSTK_ESP(struct task_struct *task)
 {
return task_pt_regs(task)->sp;
 }
+
+#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
+int arch_parse_elf_property(u32 type, const void *data, size_t datasz,
+   bool compat, struct arch_elf_state *state)
+{
+   if (type != GNU_PROPERTY_X86_FEATURE_1_AND)
+   return 0;
+
+   if (datasz != sizeof(unsigned int))
+   return -ENOEXEC;
+
+   state->gnu_property = *(unsigned int *)data;
+   return 0;
+}
+
+int arch_setup_elf_property(struct arch_elf_state *state)
+{
+   int r = 0;
+
+   if (!IS_ENABLED(CONFIG_X86_SHADOW_STACK))
+   return r;
+
+   memset(&current->thread.cet, 0, sizeof(struct cet_status));
+
+   if (static_cpu_has(X86_FEATURE_SHSTK)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   r = shstk_setup();
+   }
+
+   return r;
+}
+#endif
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..fa665eceba04 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1248,6 +1248,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
set_binfmt(&elf_format);
 
+   retval = arch_setup_elf_property(&arch_state);
+   if (retval < 0)
+   goto out;
+
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
if (retval < 0)
diff --git a/include/linux/elf.h b/include/linux/elf.h
index c9a46c4e183b..be04d15e937f 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -92,9 +92,15 @@ static inline int arch_parse_elf_property(u32 type, const void *data,
 {
return 0;
 }
+
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
 #else
 extern int arch_

[PATCH v25 25/30] x86/cet/shstk: Handle signals for shadow stack

2021-04-15 Thread Yu-cheng Yu
When shadow stack is enabled, a task's shadow stack states must be saved
along with the signal context and later restored in sigreturn.  However,
currently there is no systematic facility for extending a signal context.
There is some space left in the ucontext, but changing ucontext is likely
to create compatibility issues and there is not enough space for further
extensions.

Introduce a signal context extension struct 'sc_ext', which is used to save
the shadow stack restore token address.  The extension is located above the
fpu states, plus alignment.  The struct can be extended (such as with IBT's
wait_endbr status, introduced later), and the sc_ext.total_size field keeps
track of the total size.

Introduce routines for the allocation, save, and restore for sc_ext:
- fpu__alloc_sigcontext_ext(),
- save_extra_state_to_sigframe(),
- get_extra_state_from_sigframe(),
- restore_extra_state_to_xregs().
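
For reference, the assumed 64-bit signal frame layout with the extension
(higher addresses at the top):

	/*
	 *   |  struct sc_ext     |  <- 8-byte aligned
	 *   |  FP_XSTATE_MAGIC2  |
	 *   |  xsave/FPU state   |
	 *   |  ...               |
	 *
	 * sc_ext.total_size records the struct's size, so future kernels
	 * can grow the extension without breaking sigreturn compatibility.
	 */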

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Update commit log/comments for the sc_ext struct.
- Use restorer address already calculated.
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK.
- Change X86_FEATURE_CET to X86_FEATURE_SHSTK.
- Eliminate writing to MSR_IA32_U_CET for shadow stack.
- Change wrmsrl() to wrmsrl_safe() and handle error.

v24:
- Split out shadow stack token routines to a separate patch.
- Put signal frame save/restore routines in fpu/signal.c and rename accordingly.

 arch/x86/ia32/ia32_signal.c|  24 +++--
 arch/x86/include/asm/cet.h |   2 +
 arch/x86/include/asm/fpu/internal.h|   2 +
 arch/x86/include/uapi/asm/sigcontext.h |   9 ++
 arch/x86/kernel/fpu/signal.c   | 137 -
 arch/x86/kernel/signal.c   |   9 ++
 6 files changed, 172 insertions(+), 11 deletions(-)

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 5e3d9b7fd5fb..423abcd181f2 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -202,7 +202,8 @@ do {								\
  */
 static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 size_t frame_size,
-void __user **fpstate)
+void __user **fpstate,
+void __user *restorer)
 {
unsigned long sp, fx_aligned, math_size;
 
@@ -220,6 +221,10 @@ static void __user *get_sigframe(struct ksignal *ksig, struct pt_regs *regs,
 
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
+
+   if (save_extra_state_to_sigframe(1, *fpstate, restorer))
+   return (void __user *)-1L;
+
if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
 math_size) < 0)
return (void __user *) -1L;
@@ -249,8 +254,6 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
0x80cd, /* int $0x80 */
};
 
-   frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
-
if (ksig->ka.sa.sa_flags & SA_RESTORER) {
restorer = ksig->ka.sa.sa_restorer;
} else {
@@ -262,6 +265,8 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
restorer = &frame->retcode;
}
 
+   frame = get_sigframe(ksig, regs, sizeof(*frame), &fp, restorer);
+
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
 
@@ -317,7 +322,13 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
0,
};
 
-   frame = get_sigframe(ksig, regs, sizeof(*frame), &fp);
+   if (ksig->ka.sa.sa_flags & SA_RESTORER)
+   restorer = ksig->ka.sa.sa_restorer;
+   else
+   restorer = current->mm->context.vdso +
+   vdso_image_32.sym___kernel_rt_sigreturn;
+
+   frame = get_sigframe(ksig, regs, sizeof(*frame), &fp, restorer);
 
if (!user_access_begin(frame, sizeof(*frame)))
return -EFAULT;
@@ -334,11 +345,6 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
unsafe_put_user(0, &frame->uc.uc_link, Efault);
unsafe_compat_save_altstack(&frame->uc.uc_stack, regs->sp, Efault);
 
-   if (ksig->ka.sa.sa_flags & SA_RESTORER)
-   restorer = ksig->ka.sa.sa_restorer;
-   else
-   restorer = current->mm->context.vdso +
-   vdso_image_32.sym___kernel_rt_sigreturn;
unsafe_put_user(ptr_to_compat(restorer), &frame->pretcode, Efault);
 
/*
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index ef6155213b7e..5e66919bd2fe 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -6,6 +6,8 @@
 #include 
 
 struct task_struct;
+struct s

[PATCH v25 24/30] x86/cet/shstk: Introduce shadow stack token setup/verify routines

2021-04-15 Thread Yu-cheng Yu
A shadow stack restore token marks a restore point of the shadow stack, and
the address in a token must point directly above the token, which is within
the same shadow stack.  This is distinctly different from other pointers
on the shadow stack, since those pointers point to executable code areas.

The restore token can be used as an extra protection for signal handling.
To deliver a signal, create a shadow stack restore token and put the token
and the signal restorer address on the shadow stack.  In sigreturn, verify
the token and restore from it the shadow stack pointer.

Introduce token setup and verify routines.  Also introduce WRUSS, which is
a kernel-mode instruction but writes directly to user shadow stack.  It is
used to construct user signal stack as described above.
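
Roughly (a sketch of the idea, not the exact layout), the user shadow
stack after signal setup looks like:

	/*
	 *   | ... older shadow stack entries ... |
	 *   | restore token                      |  <- records the SSP to
	 *   | signal restorer address            |     restore; verified by
	 *   +------------------------------------+     sigreturn
	 *
	 * Both entries are written with WRUSS, because ordinary stores
	 * cannot write shadow stack memory.
	 */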

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Update inline assembly syntax, use %[].
- Change token address from (unsigned long) to (u64/u32 __user *).
- Change -EPERM to -EFAULT.

 arch/x86/include/asm/cet.h   |   9 ++
 arch/x86/include/asm/special_insns.h |  32 +++
 arch/x86/kernel/shstk.c  | 126 +++
 3 files changed, 167 insertions(+)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 8b83ded577cc..ef6155213b7e 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
+int shstk_setup_thread(struct task_struct *p, unsigned long clone_flags,
   unsigned long stack_size);
 void shstk_free(struct task_struct *p);
 void shstk_disable(void);
+int shstk_setup_rstor_token(bool ia32, unsigned long rstor,
+   unsigned long *token_addr, unsigned long *new_ssp);
+int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
+   unsigned long *new_ssp);
 #else
 static inline int shstk_setup(void) { return 0; }
 static inline int shstk_setup_thread(struct task_struct *p,
@@ -27,6 +31,11 @@ static inline int shstk_setup_thread(struct task_struct *p,
 unsigned long stack_size) { return 0; }
 static inline void shstk_free(struct task_struct *p) {}
 static inline void shstk_disable(void) {}
+static inline int shstk_setup_rstor_token(bool ia32, unsigned long rstor,
+ unsigned long *token_addr,
+ unsigned long *new_ssp) { return 0; }
+static inline int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
+ unsigned long *new_ssp) { return 0; }
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 1d3cbaef4bb7..5a0488923cae 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -234,6 +234,38 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
 }
 
+#ifdef CONFIG_X86_SHADOW_STACK
+#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
+static inline int write_user_shstk_32(u32 __user *addr, u32 val)
+{
+   asm_volatile_goto("1: wrussd %[val], (%[addr])\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "r" (addr), [val] "r" (val)
+ :: fail);
+   return 0;
+fail:
+   return -EFAULT;
+}
+#else
+static inline int write_user_shstk_32(u32 __user *addr, u32 val)
+{
+   WARN_ONCE(1, "%s used but not supported.\n", __func__);
+   return -EFAULT;
+}
+#endif
+
+static inline int write_user_shstk_64(u64 __user *addr, u64 val)
+{
+   asm_volatile_goto("1: wrussq %[val], (%[addr])\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: [addr] "r" (addr), [val] "r" (val)
+ :: fail);
+   return 0;
+fail:
+   return -EFAULT;
+}
+#endif /* CONFIG_X86_SHADOW_STACK */
+
 #define nop() asm volatile ("nop")
 
 static inline void serialize(void)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index d387df84b7f1..48a0c87414ef 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <asm/special_insns.h>
 
 static void start_update_msrs(void)
 {
@@ -176,3 +177,128 @@ void shstk_disable(void)
 
shstk_free(current);
 }
+
+static unsigned long _get_user_shstk_addr(void)
+{
+   struct fpu *fpu = &current->thread.fpu;
+   unsigned long ssp = 0;
+
+   fpregs_lock();
+
+   if (fpregs_state_valid(fpu, smp_processor_id())) {
+   rdmsrl(MSR_IA32_PL3_SSP, ssp);
+   } else {
+   struct cet_user_state *p;
+
+   p = get_xsave_addr(&fpu->state.xsave, XFEATURE_CET_USER);
+   if (p)
+   ssp = p->user_ssp;
+   }
+
+   fpregs_unlock();
+   return ssp;
+}
+
+#define TOKEN_M

[PATCH v25 23/30] x86/cet/shstk: Handle thread shadow stack

2021-04-15 Thread Yu-cheng Yu
For clone() with CLONE_VM specified, the child and the parent must have
separate shadow stacks.  Thus, the kernel allocates a new shadow stack for
the child and frees it on thread exit.

Use the stack_size passed from the clone3() syscall for the thread shadow
stack size, but cap it to min(RLIMIT_STACK, 4 GB).  A compat-mode thread
shadow stack size is further reduced to 1/4.  This allows more threads to
run in a 32-bit address space.
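
An illustration of the sizing policy described above (user-visible
arithmetic only, not the kernel code):

	#define SZ_4G	(4ULL << 30)

	static unsigned long long thread_shstk_size(unsigned long long stack_size,
						    unsigned long long rlimit_stack,
						    int compat)
	{
		unsigned long long cap = rlimit_stack < SZ_4G ? rlimit_stack : SZ_4G;
		unsigned long long size = stack_size < cap ? stack_size : cap;

		return compat ? size / 4 : size;	/* 32-bit: 1/4 size */
	}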

Signed-off-by: Yu-cheng Yu 
---
 arch/x86/include/asm/cet.h |  5 +++
 arch/x86/include/asm/mmu_context.h |  3 ++
 arch/x86/kernel/process.c  | 15 ++--
 arch/x86/kernel/shstk.c| 57 +-
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index aa85d599b184..8b83ded577cc 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -16,10 +16,15 @@ struct cet_status {
 
 #ifdef CONFIG_X86_SHADOW_STACK
 int shstk_setup(void);
+int shstk_setup_thread(struct task_struct *p, unsigned long clone_flags,
+  unsigned long stack_size);
 void shstk_free(struct task_struct *p);
 void shstk_disable(void);
 #else
 static inline int shstk_setup(void) { return 0; }
+static inline int shstk_setup_thread(struct task_struct *p,
+unsigned long clone_flags,
+unsigned long stack_size) { return 0; }
 static inline void shstk_free(struct task_struct *p) {}
 static inline void shstk_disable(void) {}
 #endif
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 27516046117a..53569114aa01 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include <asm/cet.h>
 #include 
 
 extern atomic64_t last_mm_ctx_id;
@@ -146,6 +147,8 @@ do {\
 #else
 #define deactivate_mm(tsk, mm) \
 do {   \
+   if (!tsk->vfork_done)   \
+   shstk_free(tsk);\
load_gs_index(0);   \
loadsegment(fs, 0); \
 } while (0)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c214d7085a4..fa01e8679d01 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include <asm/cet.h>
 
 #include "process.h"
 
@@ -109,6 +110,7 @@ void exit_thread(struct task_struct *tsk)
 
free_vm86(t);
 
+   shstk_free(tsk);
fpu__drop(fpu);
 }
 
@@ -122,8 +124,9 @@ static int set_new_tls(struct task_struct *p, unsigned long tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
-   struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+   unsigned long stack_size, struct task_struct *p,
+   unsigned long tls)
 {
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
@@ -163,7 +166,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
/* Kernel thread ? */
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
-   kthread_frame_init(frame, sp, arg);
+   kthread_frame_init(frame, sp, stack_size);
return 0;
}
 
@@ -181,6 +184,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
if (clone_flags & CLONE_SETTLS)
ret = set_new_tls(p, tls);
 
+#ifdef CONFIG_X86_64
+   /* Allocate a new shadow stack for pthread */
+   if (!ret)
+   ret = shstk_setup_thread(p, clone_flags, stack_size);
+#endif
+
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
 
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index c815c7507830..d387df84b7f1 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -70,6 +70,55 @@ int shstk_setup(void)
return 0;
 }
 
+int shstk_setup_thread(struct task_struct *tsk, unsigned long clone_flags,
+  unsigned long stack_size)
+{
+   unsigned long addr, size;
+   struct cet_user_state *state;
+   struct cet_status *cet = &tsk->thread.cet;
+
+   if (!cet->shstk_size)
+   return 0;
+
+   if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+   return 0;
+
+   state = get_xsave_addr(&tsk->thread.fpu.state.xsave,
+  XFEATURE_CET_USER);
+
+   if (!state)
+   return -EINVAL;
+
+   if (stack_size == 0)
+   retur

[PATCH v25 21/30] mm: Re-introduce vm_flags to do_mmap()

2021-04-15 Thread Yu-cheng Yu
There were no more callers passing vm_flags to do_mmap(), so vm_flags was
removed from the function's parameters by:

commit 45e55300f114 ("mm: remove unnecessary wrapper function do_mmap_pgoff()").

There is a new user now.  Shadow stack allocation passes VM_SHADOW_STACK to
do_mmap().  Thus, re-introduce vm_flags to do_mmap().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Peter Collingbourne 
Reviewed-by: Kees Cook 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: linux...@kvack.org
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.
- Update commit log.

 fs/aio.c   |  2 +-
 include/linux/mm.h |  3 ++-
 ipc/shm.c  |  2 +-
 mm/mmap.c  | 10 +-
 mm/nommu.c |  4 ++--
 mm/util.c  |  2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..b5d0586209a7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -529,7 +529,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int nr_events)
 
ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
 PROT_READ | PROT_WRITE,
-MAP_SHARED, 0, &unused, NULL);
+MAP_SHARED, 0, 0, &unused, NULL);
mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3e9c84f21ef6..1ccec5cc399b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2576,7 +2576,8 @@ extern unsigned long mmap_region(struct file *file, 
unsigned long addr,
struct list_head *uf);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
-   unsigned long pgoff, unsigned long *populate, struct list_head *uf);
+   vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
+   struct list_head *uf);
 extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
   struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
diff --git a/ipc/shm.c b/ipc/shm.c
index febd88daba8c..b6370eb1eaab 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1556,7 +1556,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
goto invalid;
}
 
-   addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
+   addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
diff --git a/mm/mmap.c b/mm/mmap.c
index d77fb39b6ab5..7b2992ef8ee0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1401,11 +1401,11 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
  */
 unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
-   unsigned long flags, unsigned long pgoff,
-   unsigned long *populate, struct list_head *uf)
+   unsigned long flags, vm_flags_t vm_flags,
+   unsigned long pgoff, unsigned long *populate,
+   struct list_head *uf)
 {
struct mm_struct *mm = current->mm;
-   vm_flags_t vm_flags;
int pkey = 0;
 
*populate = 0;
@@ -1467,7 +1467,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
@@ -3047,7 +3047,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
-   prot, flags, pgoff, &populate, NULL);
+   prot, flags, 0, pgoff, &populate, NULL);
fput(file);
 out:
mmap_write_unlock(mm);
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..9b6f7a1895c2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1071,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
+   vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1078,7 +1079,6 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
-   vm_flags_t vm_flags;

[PATCH v25 22/30] x86/cet/shstk: Add user-mode shadow stack support

2021-04-15 Thread Yu-cheng Yu
Introduce basic shadow stack enabling/disabling/allocation routines.
A task's shadow stack is allocated from memory with the VM_SHADOW_STACK
flag and has a fixed size of min(RLIMIT_STACK, 4GB).

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK.
- Update alloc_shstk(), remove unused input flags.
v24:
- Rename cet.c to shstk.c, update related areas accordingly.

 arch/x86/include/asm/cet.h   |  29 
 arch/x86/include/asm/processor.h |   5 ++
 arch/x86/kernel/Makefile |   2 +
 arch/x86/kernel/shstk.c  | 123 +++
 4 files changed, 159 insertions(+)
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/shstk.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index ..aa85d599b184
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include 
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_status {
+   unsigned long   shstk_base;
+   unsigned long   shstk_size;
+};
+
+#ifdef CONFIG_X86_SHADOW_STACK
+int shstk_setup(void);
+void shstk_free(struct task_struct *p);
+void shstk_disable(void);
+#else
+static inline int shstk_setup(void) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline void shstk_disable(void) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f1b9ed5efaa9..3690b78e55bb 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct vm86;
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -535,6 +536,10 @@ struct thread_struct {
 
unsigned intsig_on_uaccess_err:1;
 
+#ifdef CONFIG_X86_SHADOW_STACK
+   struct cet_status   cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu  fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..0f99b093f350 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,6 +150,8 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
+obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index ..c815c7507830
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,123 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void start_update_msrs(void)
+{
+   fpregs_lock();
+   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   __fpregs_load_activate();
+}
+
+static void end_update_msrs(void)
+{
+   fpregs_unlock();
+}
+
+static unsigned long alloc_shstk(unsigned long size)
+{
+   struct mm_struct *mm = current->mm;
+   unsigned long addr, populate;
+   int flags = MAP_ANONYMOUS | MAP_PRIVATE;
+
+   mmap_write_lock(mm);
+   addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHADOW_STACK, 0,
+  &populate, NULL);
+   mmap_write_unlock(mm);
+
+   return addr;
+}
+
+int shstk_setup(void)
+{
+   unsigned long addr, size;
+   struct cet_status *cet = &current->thread.cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return -EOPNOTSUPP;
+
+   size = round_up(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G), PAGE_SIZE);
+   addr = alloc_shstk(size);
+   if (IS_ERR_VALUE(addr))
+   return PTR_ERR((void *)addr);
+
+   cet->shstk_base = addr;
+   cet->shstk_size = size;
+
+   start_update_msrs();
+   wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+   wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+   end_update_msrs();
+   return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+   struct cet_status *cet = &tsk->thread.cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+   !cet->shstk_size ||
+   !cet->shstk_base)
+   return;
+
+   if (!tsk->mm)
+   return;
+
+   while (1) {
+   int r;
+
+   r = vm_munmap(cet->shstk_base, cet->shstk_size);
+
+   /*
+* vm_munmap() returns -EINTR when mmap_lock is held by
+* something else, and that lock should not be held for a
+

[PATCH v25 20/30] mm/mprotect: Exclude shadow stack from preserve_write

2021-04-15 Thread Yu-cheng Yu
In change_pte_range(), when a PTE is changed for prot_numa, _PAGE_RW is
preserved to avoid the additional write fault after the NUMA hinting fault.
However, pte_write() now covers both normal writable PTEs and shadow stack
(RW=0, Dirty=1) PTEs; the latter do not have _PAGE_RW and do not need it
preserved.

Exclude shadow stack from the preserve_write test, and apply the same
change to change_huge_pmd().

Signed-off-by: Yu-cheng Yu 
Cc: Kirill A. Shutemov 
---
v25:
- Move is_shadow_stack_mapping() to a separate line.
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().

 mm/huge_memory.c | 7 +++
 mm/mprotect.c| 7 +++
 2 files changed, 14 insertions(+)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index a0858eac0320..9e51cae35aad 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1824,6 +1824,13 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
return 0;
 
preserve_write = prot_numa && pmd_write(*pmd);
+
+   /*
+* Preserve only normal writable huge PMD, but not shadow
+* stack (RW=0, Dirty=1).
+*/
+   if (is_shadow_stack_mapping(vma->vm_flags))
+   preserve_write = false;
ret = 1;
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c1ce78d688b6..3b2f0d75519f 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -77,6 +77,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_t ptent;
bool preserve_write = prot_numa && pte_write(oldpte);
 
+   /*
+* Preserve only normal writable PTE, but not shadow
+* stack (RW=0, Dirty=1).
+*/
+   if (is_shadow_stack_mapping(vma->vm_flags))
+   preserve_write = false;
+
/*
 * Avoid trapping faults against the zero or KSM
 * pages. See similar comment in change_huge_pmd.
-- 
2.21.0



[PATCH v25 19/30] mm: Update can_follow_write_pte() for shadow stack

2021-04-15 Thread Yu-cheng Yu
can_follow_write_pte() ensures a read-only page is COWed by checking the
FOLL_COW flag, and uses pte_dirty() to validate that the flag is still
valid.
Like a writable data page, a shadow stack page is writable, and becomes
read-only during copy-on-write, but it is always dirty.  Thus, in the
can_follow_write_pte() check, it belongs to the writable page case and
should be excluded from the read-only page pte_dirty() check.  Apply
the same changes to can_follow_write_pmd().

While at it, also split the long line into smaller ones.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v25:
- Split long line into smaller ones.
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().

 mm/gup.c | 16 
 mm/huge_memory.c | 16 
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index ef7d2da9f03f..f2813cf4d07b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -356,10 +356,18 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address,
  * FOLL_FORCE can write to even unwritable pte's, but only
  * after we've gone through a COW cycle and they are dirty.
  */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+static inline bool can_follow_write_pte(pte_t pte, unsigned int flags,
+   vm_flags_t vm_flags)
 {
-   return pte_write(pte) ||
-   ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+   if (pte_write(pte))
+   return true;
+   if ((flags & (FOLL_FORCE | FOLL_COW)) != (FOLL_FORCE | FOLL_COW))
+   return false;
+   if (!pte_dirty(pte))
+   return false;
+   if (is_shadow_stack_mapping(vm_flags))
+   return false;
+   return true;
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -402,7 +410,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
-   if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
+   if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags, vma->vm_flags)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 044029ef45cd..a0858eac0320 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1338,10 +1338,18 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
  * FOLL_FORCE can write to even unwritable pmd's, but only
  * after we've gone through a COW cycle and they are dirty.
  */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags,
+   vm_flags_t vm_flags)
 {
-   return pmd_write(pmd) ||
-  ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+   if (pmd_write(pmd))
+   return true;
+   if ((flags & (FOLL_FORCE | FOLL_COW)) != (FOLL_FORCE | FOLL_COW))
+   return false;
+   if (!pmd_dirty(pmd))
+   return false;
+   if (is_shadow_stack_mapping(vm_flags))
+   return false;
+   return true;
 }
 
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1354,7 +1362,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 
assert_spin_locked(pmd_lockptr(mm, pmd));
 
-   if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+   if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags, vma->vm_flags))
goto out;
 
/* Avoid dumping huge zero page */
-- 
2.21.0



[PATCH v25 18/30] mm/mmap: Add shadow stack pages to memory accounting

2021-04-15 Thread Yu-cheng Yu
Account shadow stack pages to stack memory.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v25:
- Remove #ifdef CONFIG_ARCH_HAS_SHADOW_STACK for is_shadow_stack_mapping().
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/include/asm/pgtable.h | 3 +++
 arch/x86/mm/pgtable.c  | 5 +
 include/linux/pgtable.h| 9 +
 mm/mmap.c  | 5 +
 4 files changed, 22 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index da5dea417663..7f324edaedfa 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1692,6 +1692,9 @@ static inline bool arch_faults_on_old_pte(void)
 #define maybe_mkwrite maybe_mkwrite
 extern pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma);
 
+#define is_shadow_stack_mapping is_shadow_stack_mapping
+extern bool is_shadow_stack_mapping(vm_flags_t vm_flags);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e778dbbef3d8..69c0ef583c55 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -897,3 +897,8 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 
 #endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+bool is_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return (vm_flags & VM_SHADOW_STACK);
+}
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772392a379..45b601fa1a1c 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1446,6 +1446,15 @@ static inline bool arch_has_pfn_modify_check(void)
 }
 #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
 
+#ifdef CONFIG_MMU
+#ifndef is_shadow_stack_mapping
+static inline bool is_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return false;
+}
+#endif
+#endif /* CONFIG_MMU */
+
 /*
  * Architecture PAGE_KERNEL_* fallbacks
  *
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..d77fb39b6ab5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1718,6 +1718,9 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags)
if (file && is_file_hugepages(file))
return 0;
 
+   if (is_shadow_stack_mapping(vm_flags))
+   return 1;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
@@ -3387,6 +3390,8 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
+   else if (is_shadow_stack_mapping(flags))
+   mm->stack_vm += npages;
 }
 
 static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
-- 
2.21.0



[PATCH v25 17/30] mm: Add guard pages around a shadow stack.

2021-04-15 Thread Yu-cheng Yu
INCSSP(Q/D) increments the shadow stack pointer and 'pops and discards' the
first and the last elements in the range, effectively touching those memory
areas.

The maximum moving distance by INCSSPQ is 255 * 8 = 2040 bytes and
255 * 4 = 1020 bytes by INCSSPD.  Both ranges are smaller than PAGE_SIZE.
Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
CALL, and RET from going beyond.
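
As a quick sanity check of that arithmetic, a user-space sketch (not part of
the patch; assumes 4 KB pages and a C11 compiler):

    #include <assert.h>

    #define GUARD_PAGE_SIZE 4096UL

    /* INCSSPQ moves SSP by at most 255 qwords; INCSSPD by 255 dwords. */
    static_assert(255 * 8 < GUARD_PAGE_SIZE, "a 4 KB guard page stops INCSSPQ");
    static_assert(255 * 4 < GUARD_PAGE_SIZE, "a 4 KB guard page stops INCSSPD");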

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
Cc: Kees Cook 
---
v25:
- Move SHADOW_STACK_GUARD_GAP to arch/x86/mm/mmap.c.
v24:
- Instead changing vm_*_gap(), create x86-specific versions.

 arch/x86/include/asm/page_types.h |  7 +
 arch/x86/mm/mmap.c| 46 +++
 include/linux/mm.h|  4 +++
 3 files changed, 56 insertions(+)

diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a506a411474d..e1533fdc08b4 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,6 +73,13 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned long end_pfn);
 
 extern void initmem_init(void);
 
+#define vm_start_gap vm_start_gap
+struct vm_area_struct;
+extern unsigned long vm_start_gap(struct vm_area_struct *vma);
+
+#define vm_end_gap vm_end_gap
+extern unsigned long vm_end_gap(struct vm_area_struct *vma);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index f3f52c5e2fd6..3f6455d14d18 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -250,3 +250,49 @@ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
return false;
return true;
 }
+
+/*
+ * Shadow stack pointer is moved by CALL, RET, and INCSSP(Q/D).  INCSSPQ
+ * moves shadow stack pointer up to 255 * 8 = ~2 KB (~1KB for INCSSPD) and
+ * touches the first and the last element in the range, which triggers a
+ * page fault if the range is not in a shadow stack.  Because of this,
+ * creating 4-KB guard pages around a shadow stack prevents these
+ * instructions from going beyond.
+ */
+#define SHADOW_STACK_GUARD_GAP PAGE_SIZE
+
+unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+   unsigned long vm_start = vma->vm_start;
+   unsigned long gap = 0;
+
+   if (vma->vm_flags & VM_GROWSDOWN)
+   gap = stack_guard_gap;
+   else if (vma->vm_flags & VM_SHADOW_STACK)
+   gap = SHADOW_STACK_GUARD_GAP;
+
+   if (gap != 0) {
+   vm_start -= gap;
+   if (vm_start > vma->vm_start)
+   vm_start = 0;
+   }
+   return vm_start;
+}
+
+unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+   unsigned long vm_end = vma->vm_end;
+   unsigned long gap = 0;
+
+   if (vma->vm_flags & VM_GROWSUP)
+   gap = stack_guard_gap;
+   else if (vma->vm_flags & VM_SHADOW_STACK)
+   gap = SHADOW_STACK_GUARD_GAP;
+
+   if (gap != 0) {
+   vm_end += gap;
+   if (vm_end < vma->vm_end)
+   vm_end = -PAGE_SIZE;
+   }
+   return vm_end;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6ac9b3e9a865..3e9c84f21ef6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2660,6 +2660,7 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m
return vma;
 }
 
+#ifndef vm_start_gap
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
unsigned long vm_start = vma->vm_start;
@@ -2671,7 +2672,9 @@ static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
}
return vm_start;
 }
+#endif
 
+#ifndef vm_end_gap
 static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
 {
unsigned long vm_end = vma->vm_end;
@@ -2683,6 +2686,7 @@ static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
}
return vm_end;
 }
+#endif
 
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {
-- 
2.21.0



[PATCH v25 16/30] mm: Fixup places that call pte_mkwrite() directly

2021-04-15 Thread Yu-cheng Yu
When serving a page fault, maybe_mkwrite() makes a PTE writable if it is in
a writable vma.  A shadow stack vma is writable, but its PTEs need
_PAGE_DIRTY to be set to become writable.  For this reason, maybe_mkwrite()
has been updated.

There are a few places that call pte_mkwrite() directly, but have the
same result as from maybe_mkwrite().  These sites need to be updated for
shadow stack as well.  Thus, change them to maybe_mkwrite():

- do_anonymous_page() and migrate_vma_insert_page() check VM_WRITE directly
  and call pte_mkwrite(), which is the same as maybe_mkwrite().  Change
  them to maybe_mkwrite().

- In do_numa_page(), if the numa entry was writable, then pte_mkwrite()
  is called directly.  Fix it by doing maybe_mkwrite().  Make the same
  changes to do_huge_pmd_numa_page().

- In change_pte_range(), pte_mkwrite() is called directly.  Replace it with
  maybe_mkwrite().

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v25:
- Apply same changes to do_huge_pmd_numa_page() as to do_numa_page().

 mm/huge_memory.c | 2 +-
 mm/memory.c  | 5 ++---
 mm/migrate.c | 3 +--
 mm/mprotect.c| 2 +-
 4 files changed, 5 insertions(+), 7 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8203bd6ae4bd..044029ef45cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1553,7 +1553,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
pmd = pmd_modify(pmd, vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (was_writable)
-   pmd = pmd_mkwrite(pmd);
+   pmd = maybe_pmd_mkwrite(pmd, vma);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd);
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
unlock_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 550405fc3b5e..0cb1028b1c02 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3561,8 +3561,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
 
entry = mk_pte(page, vma->vm_page_prot);
-   if (vma->vm_flags & VM_WRITE)
-   entry = pte_mkwrite(pte_mkdirty(entry));
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
@@ -4125,7 +4124,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
-   pte = pte_mkwrite(pte);
+   pte = maybe_mkwrite(pte, vma);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..7251c88a3d64 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2976,8 +2976,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
-   if (vma->vm_flags & VM_WRITE)
-   entry = pte_mkwrite(pte_mkdirty(entry));
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
}
 
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94188df1ee55..c1ce78d688b6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -135,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (dirty_accountable && pte_dirty(ptent) &&
(pte_soft_dirty(ptent) ||
 !(vma->vm_flags & VM_SOFTDIRTY))) {
-   ptent = pte_mkwrite(ptent);
+   ptent = maybe_mkwrite(ptent, vma);
}
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
-- 
2.21.0



[PATCH v25 15/30] x86/mm: Update maybe_mkwrite() for shadow stack

2021-04-15 Thread Yu-cheng Yu
When serving a page fault, maybe_mkwrite() makes a PTE writable if its vma
has VM_WRITE.

A shadow stack vma has VM_SHADOW_STACK.  Its PTEs have _PAGE_DIRTY, but not
_PAGE_RW.  In fork(), _PAGE_DIRTY is cleared to cause copy-on-write,
and in the page fault handler, _PAGE_DIRTY is restored and the shadow stack
page is writable again.

Introduce an x86 version of maybe_mkwrite(), which sets proper PTE bits
according to VM flags.

Apply the same changes to maybe_pmd_mkwrite().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
Cc: Kees Cook 
---
v24:
- Instead of doing arch_maybe_mkwrite(), overwrite maybe*_mkwrite() with
  x86 versions.
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/include/asm/pgtable.h |  6 ++
 arch/x86/mm/pgtable.c  | 20 
 include/linux/mm.h |  2 ++
 mm/huge_memory.c   |  2 ++
 4 files changed, 30 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 46d9394b884f..da5dea417663 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -308,6 +308,9 @@ static inline int pmd_trans_huge(pmd_t pmd)
return (pmd_val(pmd) & (_PAGE_PSE|_PAGE_DEVMAP)) == _PAGE_PSE;
 }
 
+#define maybe_pmd_mkwrite maybe_pmd_mkwrite
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+
 #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
 static inline int pud_trans_huge(pud_t pud)
 {
@@ -1686,6 +1689,9 @@ static inline bool arch_faults_on_old_pte(void)
return false;
 }
 
+#define maybe_mkwrite maybe_mkwrite
+extern pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..e778dbbef3d8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -610,6 +610,26 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 }
 #endif
 
+pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   if (likely(vma->vm_flags & VM_WRITE))
+   pte = pte_mkwrite(pte);
+   else if (likely(vma->vm_flags & VM_SHADOW_STACK))
+   pte = pte_mkwrite_shstk(pte);
+   return pte;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+   if (likely(vma->vm_flags & VM_WRITE))
+   pmd = pmd_mkwrite(pmd);
+   else if (likely(vma->vm_flags & VM_SHADOW_STACK))
+   pmd = pmd_mkwrite_shstk(pmd);
+   return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 08282eb2f195..6ac9b3e9a865 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -993,12 +993,14 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
+#ifndef maybe_mkwrite
 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 {
if (likely(vma->vm_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
+#endif
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ae907a9c2050..8203bd6ae4bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -478,12 +478,14 @@ static int __init setup_transparent_hugepage(char *str)
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
+#ifndef maybe_pmd_mkwrite
 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
return pmd;
 }
+#endif
 
 #ifdef CONFIG_MEMCG
 static inline struct deferred_split *get_deferred_split_queue(struct page *page)
-- 
2.21.0



[PATCH v25 14/30] x86/mm: Shadow Stack page fault error checking

2021-04-15 Thread Yu-cheng Yu
Shadow stack accesses are those that are performed by the CPU where it
expects to encounter a shadow stack mapping.  These accesses are performed
implicitly by CALL/RET at the site of the shadow stack pointer, or
explicitly by shadow stack management instructions like WRUSSQ.

Shadow stack accesses to shadow stack mappings can see faults in normal,
valid operation, just like regular accesses to regular mappings.  Shadow
stacks need some of the same features, like delayed allocation, swap, and
copy-on-write.

Shadow stack accesses can also result in errors, such as when a shadow
stack overflows, or if a shadow stack access occurs to a non-shadow-stack
mapping.

In handling a shadow stack page fault, verify it occurs within a shadow
stack mapping.  It is always an error otherwise.  For valid shadow stack
accesses, set FAULT_FLAG_WRITE to effect copy-on-write.  Because clearing
_PAGE_DIRTY (vs. _PAGE_RW) is used to trigger the fault, shadow stack read
fault and shadow stack write fault are not differentiated and both are
handled as a write access.
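
For illustration, the write-like test this amounts to; fault_is_write_like()
is a hypothetical helper, and the bit values are the x86 page-fault error
code bits (bit 1 is the write bit, bit 6 is added by this patch):

    #define X86_PF_WRITE    (1 << 1)
    #define X86_PF_SHSTK    (1 << 6)

    /* Both write faults and shadow stack faults take the write/COW path. */
    static int fault_is_write_like(unsigned long error_code)
    {
        return !!(error_code & (X86_PF_WRITE | X86_PF_SHSTK));
    }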

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Reviewed-by: Kirill A. Shutemov 
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/include/asm/trap_pf.h |  2 ++
 arch/x86/mm/fault.c| 19 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 10b1de500ab1..afa524325e55 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
  *   bit 3 ==  1: use of reserved bit detected
  *   bit 4 ==  1: fault was an instruction fetch
  *   bit 5 ==  1: protection keys block access
+ *   bit 6 ==  1: shadow stack access fault
  *   bit 15 == 1: SGX MMU page-fault
  */
 enum x86_pf_error_code {
@@ -20,6 +21,7 @@ enum x86_pf_error_code {
X86_PF_RSVD =   1 << 3,
X86_PF_INSTR=   1 << 4,
X86_PF_PK   =   1 << 5,
+   X86_PF_SHSTK=   1 << 6,
X86_PF_SGX  =   1 << 15,
 };
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a73347e2cdfc..394e504305b7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1100,6 +1100,17 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
   (error_code & X86_PF_INSTR), foreign))
return 1;
 
+   /*
+* Verify a shadow stack access is within a shadow stack VMA.
+* It is always an error otherwise.  Normal data access to a
+* shadow stack area is checked in the cases that follow.
+*/
+   if (error_code & X86_PF_SHSTK) {
+   if (!(vma->vm_flags & VM_SHADOW_STACK))
+   return 1;
+   return 0;
+   }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1293,6 +1304,14 @@ void do_user_addr_fault(struct pt_regs *regs,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
+   /*
+* Clearing _PAGE_DIRTY is used to detect shadow stack access.
+* This method cannot distinguish shadow stack read vs. write.
+* For valid shadow stack accesses, set FAULT_FLAG_WRITE to effect
+* copy-on-write.
+*/
+   if (error_code & X86_PF_SHSTK)
+   flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
-- 
2.21.0



[PATCH v25 13/30] mm: Introduce VM_SHADOW_STACK for shadow stack memory

2021-04-15 Thread Yu-cheng Yu
A shadow stack PTE must be read-only and have _PAGE_DIRTY set.  However,
read-only and Dirty PTEs also exist for copy-on-write (COW) pages.  These
two cases are handled differently for page faults.  Introduce
VM_SHADOW_STACK to track shadow stack VMAs.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
Cc: Kees Cook 
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK.

 Documentation/filesystems/proc.rst | 1 +
 arch/x86/mm/mmap.c | 2 ++
 fs/proc/task_mmu.c | 3 +++
 include/linux/mm.h | 8 
 4 files changed, 14 insertions(+)

diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 48fbfc336ebf..5d8a2d75c799 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -549,6 +549,7 @@ encoded manner. The codes are the following:
 mg    mergable advise flag
 bt    arm64 BTI guarded page
 mt    arm64 MTE allocation tags are enabled
+ss    shadow stack page
 ==    =======================================
 
 Note that there is no guarantee that every flag and associated mnemonic will
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..f3f52c5e2fd6 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -165,6 +165,8 @@ unsigned long get_mmap_base(int is_legacy)
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
+   if (vma->vm_flags & VM_SHADOW_STACK)
+   return "[shadow stack]";
return NULL;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..0aa57de9dfab 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_PKEY_BIT4)]   = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+   [ilog2(VM_SHADOW_STACK)]= "ss",
+#endif
};
size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..08282eb2f195 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -312,11 +312,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2 34  /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_BIT_4 36  /* bit only usable on 64-bit architectures */
+#define VM_HIGH_ARCH_BIT_5 37  /* bit only usable on 64-bit architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -332,6 +334,12 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
 
+#ifdef CONFIG_X86_SHADOW_STACK
+# define VM_SHADOW_STACK   VM_HIGH_ARCH_5
+#else
+# define VM_SHADOW_STACK   VM_NONE
+#endif
+
 #if defined(CONFIG_X86)
 # define VM_PAT VM_ARCH_1   /* PAT reserves whole VMA at once (x86) */
 #elif defined(CONFIG_PPC)
-- 
2.21.0



[PATCH v25 12/30] x86/mm: Update ptep_set_wrprotect() and pmdp_set_wrprotect() for transition from _PAGE_DIRTY to _PAGE_COW

2021-04-15 Thread Yu-cheng Yu
When Shadow Stack is introduced, [R/O + _PAGE_DIRTY] PTE is reserved for
shadow stack.  Copy-on-write PTEs have [R/O + _PAGE_COW].

When a PTE goes from [R/W + _PAGE_DIRTY] to [R/O + _PAGE_COW], it could
become a transient shadow stack PTE in two cases:

The first case is that some processors can start a write but end up seeing
a read-only PTE by the time they get to the Dirty bit, creating a transient
shadow stack PTE.  However, this will not occur on processors supporting
Shadow Stack, and a TLB flush is not necessary.

The second case is that when _PAGE_DIRTY is replaced with _PAGE_COW non-
atomically, a transient shadow stack PTE can be created as a result.
Thus, prevent that with cmpxchg.
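
A user-space sketch of the same pattern, with C11 atomics standing in for the
kernel's try_cmpxchg() and a hypothetical RW bit position (illustrative only,
not the patch itself):

    #include <stdatomic.h>
    #include <stdint.h>

    #define PTE_RW ((uint64_t)1 << 1)   /* hypothetical bit position */

    static void wrprotect(_Atomic uint64_t *pte)
    {
        uint64_t old = atomic_load(pte);

        /*
         * Recompute from a fresh snapshot until no concurrent update races
         * us; a value derived from a stale PTE is never stored.
         */
        while (!atomic_compare_exchange_weak(pte, &old, old & ~PTE_RW))
            ;
    }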

Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided many
insights to the issue.  Jann Horn provided the cmpxchg solution.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Reviewed-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 36 ++
 1 file changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e1739f590ca6..46d9394b884f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1306,6 +1306,24 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
 static inline void ptep_set_wrprotect(struct mm_struct *mm,
  unsigned long addr, pte_t *ptep)
 {
+   /*
+* If Shadow Stack is enabled, pte_wrprotect() moves _PAGE_DIRTY
+* to _PAGE_COW (see comments at pte_wrprotect()).
+* When a thread reads a RW=1, Dirty=0 PTE and before changing it
+* to RW=0, Dirty=0, another thread could have written to the page
+* and the PTE is RW=1, Dirty=1 now.  Use try_cmpxchg() to detect
+* PTE changes and update old_pte, then try again.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   pte_t old_pte, new_pte;
+
+   old_pte = READ_ONCE(*ptep);
+   do {
+   new_pte = pte_wrprotect(old_pte);
+   } while (!try_cmpxchg(&ptep->pte, &old_pte.pte, new_pte.pte));
+
+   return;
+   }
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
 }
 
@@ -1350,6 +1368,24 @@ static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
 static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
 {
+   /*
+* If Shadow Stack is enabled, pmd_wrprotect() moves _PAGE_DIRTY
+* to _PAGE_COW (see comments at pmd_wrprotect()).
+* When a thread reads a RW=1, Dirty=0 PMD and before changing it
+* to RW=0, Dirty=0, another thread could have written to the page
+* and the PMD is RW=1, Dirty=1 now.  Use try_cmpxchg() to detect
+* PMD changes and update old_pmd, then try again.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   pmd_t old_pmd, new_pmd;
+
+   old_pmd = READ_ONCE(*pmdp);
+   do {
+   new_pmd = pmd_wrprotect(old_pmd);
+   } while (!try_cmpxchg((pmdval_t *)pmdp, (pmdval_t *)&old_pmd, pmd_val(new_pmd)));
+
+   return;
+   }
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
-- 
2.21.0



[PATCH v25 11/30] x86/mm: Update pte_modify for _PAGE_COW

2021-04-15 Thread Yu-cheng Yu
The read-only and Dirty PTE has been used to indicate copy-on-write pages.
However, newer x86 processors also regard a read-only and Dirty PTE as a
shadow stack page.  In order to separate the two, the software-defined
_PAGE_COW is created to replace _PAGE_DIRTY for the copy-on-write case, and
pte_*() are updated.

pte_modify() changes a PTE to 'newprot', but it doesn't use the pte_*()
helpers.  Introduce fixup_dirty_pte(), which sets a dirty PTE, based on
_PAGE_RW, to either _PAGE_DIRTY or _PAGE_COW.

Apply the same changes to pmd_modify().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9c056d5815de..e1739f590ca6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -799,6 +799,23 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)
 
 static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
 
+static inline pteval_t fixup_dirty_pte(pteval_t pteval)
+{
+   pte_t pte = __pte(pteval);
+
+   /*
+* Fix up potential shadow stack page flags because the RO, Dirty
+* PTE is special.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   if (pte_dirty(pte)) {
+   pte = pte_mkclean(pte);
+   pte = pte_mkdirty(pte);
+   }
+   }
+   return pte_val(pte);
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
pteval_t val = pte_val(pte), oldval = val;
@@ -809,16 +826,36 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 */
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
+   val = fixup_dirty_pte(val);
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
 }
 
+static inline int pmd_write(pmd_t pmd);
+static inline pmdval_t fixup_dirty_pmd(pmdval_t pmdval)
+{
+   pmd_t pmd = __pmd(pmdval);
+
+   /*
+* Fix up potential shadow stack page flags because the RO, Dirty
+* PMD is special.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   if (pmd_dirty(pmd)) {
+   pmd = pmd_mkclean(pmd);
+   pmd = pmd_mkdirty(pmd);
+   }
+   }
+   return pmd_val(pmd);
+}
+
 static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 {
pmdval_t val = pmd_val(pmd), oldval = val;
 
val &= _HPAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+   val = fixup_dirty_pmd(val);
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
 }
-- 
2.21.0



[PATCH v25 09/30] x86/mm: Introduce _PAGE_COW

2021-04-15 Thread Yu-cheng Yu
There is essentially no room left in the x86 hardware PTEs on some OSes
(not Linux).  That left the hardware architects looking for a way to
represent a new memory type (shadow stack) within the existing bits.
They chose to repurpose a lightly-used state: Write=0, Dirty=1.

The reason it's lightly used is that Dirty=1 is normally set by hardware,
and hardware will not set Dirty=1 on a Write=0 PTE.  Software must
normally be involved to create one of these PTEs, so software can simply
opt to not create them.

In places where Linux normally creates Write=0, Dirty=1, it can use the
software-defined _PAGE_COW in place of the hardware _PAGE_DIRTY.  In other
words, whenever Linux needs to create Write=0, Dirty=1, it instead creates
Write=0, Cow=1, except for shadow stack, which is Write=0, Dirty=1.  This
clearly separates shadow stack from other data, and results in the
following:

(a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
(b) A R/O page that has been COW'ed: (Write=0, Cow=1)
The user page is in a R/O VMA, and get_user_pages() needs a writable
copy.  The page fault handler creates a copy of the page and sets
the new copy's PTE as Write=0 and Cow=1.
(c) A shadow stack PTE: (Write=0, Dirty=1)
(d) A shared shadow stack PTE: (Write=0, Cow=1)
When a shadow stack page is being shared among processes (this happens
at fork()), its PTE is made Dirty=0, so the next shadow stack access
causes a fault, and the page is duplicated and Dirty=1 is set again.
This is the COW equivalent for shadow stack pages, even though it's
copy-on-access rather than copy-on-write.
(e) A page where the processor observed a Write=1 PTE, started a write, set
Dirty=1, but then observed a Write=0 PTE.  That's possible today, but
will not happen on processors that support shadow stack.

Define _PAGE_COW and update pte_*() helpers and apply the same changes to
pmd and pud.

After this, there are six free bits left in the 64-bit PTE, and no more
free bits in the 32-bit PTE (except for PAE).  Shadow Stack is not
implemented for the 32-bit kernel.
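
To summarize the encodings above, an illustrative sketch (not part of the
patch); is_shstk_encoding() is hypothetical and mirrors the pte_shstk()
helper in the diff below:

    /*
     *  Write  Dirty  Cow
     *    1      1     0    normal, written-to page
     *    0      0     1    COW page (cases (a), (b), and (d))
     *    0      1     0    shadow stack page (case (c))
     */
    static inline bool is_shstk_encoding(unsigned long flags)
    {
        /* Shadow stack is the only user of Write=0, Dirty=1. */
        return (flags & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
    }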

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
v24:
- Replace CONFIG_X86_CET with CONFIG_X86_SHADOW_STACK.

 arch/x86/include/asm/pgtable.h   | 195 ---
 arch/x86/include/asm/pgtable_types.h |  42 +-
 2 files changed, 216 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c1650d0af1b5..9c056d5815de 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -121,11 +121,21 @@ extern pmdval_t early_pmd_flags;
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
 {
-   return pte_flags(pte) & _PAGE_DIRTY;
+   /*
+* A dirty PTE has Dirty=1 or Cow=1.
+*/
+   return pte_flags(pte) & _PAGE_DIRTY_BITS;
 }
 
+static inline bool pte_shstk(pte_t pte)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return false;
+
+   return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
+}
 
 static inline u32 read_pkru(void)
 {
@@ -160,9 +170,20 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
+{
+   /*
+* A dirty PMD has Dirty=1 or Cow=1.
+*/
+   return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
 {
-   return pmd_flags(pmd) & _PAGE_DIRTY;
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return false;
+
+   return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
 }
 
 static inline int pmd_young(pmd_t pmd)
@@ -170,9 +191,12 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
 }
 
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
 {
-   return pud_flags(pud) & _PAGE_DIRTY;
+   /*
+* A dirty PUD has Dirty=1 or Cow=1.
+*/
+   return pud_flags(pud) & _PAGE_DIRTY_BITS;
 }
 
 static inline int pud_young(pud_t pud)
@@ -182,13 +206,23 @@ static inline int pud_young(pud_t pud)
 
 static inline int pte_write(pte_t pte)
 {
-   return pte_flags(pte) & _PAGE_RW;
+   /*
+* Shadow stack pages are always writable - but not by normal
+* instructions, and only by shadow stack operations.  Therefore,
+* the W=0,D=1 test with pte_shstk().
+*/
+   return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
 }
 
 #define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
-   return pmd_flags(pmd) & _PAGE_RW;
+   /*
+* Shadow stack pages are always writable - but not by normal
+* instructions, and only by shadow stack operations.  Therefore,
+* the W=0,D=1 test with pmd_shstk().
+*/
+   return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);
 }

[PATCH v25 10/30] drm/i915/gvt: Change _PAGE_DIRTY to _PAGE_DIRTY_BITS

2021-04-15 Thread Yu-cheng Yu
After the introduction of _PAGE_COW, a modified page's PTE can have either
_PAGE_DIRTY or _PAGE_COW.  Change _PAGE_DIRTY to _PAGE_DIRTY_BITS.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Reviewed-by: Kirill A. Shutemov 
Cc: David Airlie 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Daniel Vetter 
Cc: Rodrigo Vivi 
Cc: Zhenyu Wang 
Cc: Zhi Wang 
---
 drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 897c007ea96a..937b6083b2dc 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1216,7 +1216,7 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
}
 
/* Clear dirty field. */
-   se->val64 &= ~_PAGE_DIRTY;
+   se->val64 &= ~_PAGE_DIRTY_BITS;
 
ops->clear_pse(se);
ops->clear_ips(se);
-- 
2.21.0



[PATCH v25 08/30] x86/mm: Move pmd_write(), pud_write() up in the file

2021-04-15 Thread Yu-cheng Yu
To prepare the introduction of _PAGE_COW, move pmd_write() and
pud_write() up in the file, so that they can be used by other
helpers below.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..c1650d0af1b5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -185,6 +185,18 @@ static inline int pte_write(pte_t pte)
return pte_flags(pte) & _PAGE_RW;
 }
 
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+   return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+   return pud_flags(pud) & _PAGE_RW;
+}
+
 static inline int pte_huge(pte_t pte)
 {
return pte_flags(pte) & _PAGE_PSE;
@@ -1128,12 +1140,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
  unsigned long address, pmd_t *pmdp);
 
 
-#define pmd_write pmd_write
-static inline int pmd_write(pmd_t pmd)
-{
-   return pmd_flags(pmd) & _PAGE_RW;
-}
-
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
   pmd_t *pmdp)
@@ -1155,12 +1161,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
-   return pud_flags(pud) & _PAGE_RW;
-}
-
 #ifndef pmdp_establish
 #define pmdp_establish pmdp_establish
 static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
-- 
2.21.0



[PATCH v25 07/30] x86/mm: Remove _PAGE_DIRTY from kernel RO pages

2021-04-15 Thread Yu-cheng Yu
The x86 family of processors does not directly create read-only and Dirty
PTEs.  These PTEs are created by software.  One such case is that kernel
read-only pages are historically set up as Dirty.

New processors that support Shadow Stack regard read-only and Dirty PTEs as
shadow stack pages.  This results in ambiguity between shadow stack and
kernel read-only pages.  To resolve this, remove Dirty from kernel
read-only pages.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
Cc: "H. Peter Anvin" 
Cc: Kees Cook 
Cc: Thomas Gleixner 
Cc: Dave Hansen 
Cc: Christoph Hellwig 
Cc: Andy Lutomirski 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Peter Zijlstra 
---
 arch/x86/include/asm/pgtable_types.h | 6 +++---
 arch/x86/mm/pat/set_memory.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..9db61817dfff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -192,10 +192,10 @@ enum page_cache_mode {
 #define _KERNPG_TABLE   (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
 #define _PAGE_TABLE_NOENC   (__PP|__RW|_USR|___A|   0|___D|   0|   0)
 #define _PAGE_TABLE (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
-#define __PAGE_KERNEL_RO(__PP|   0|   0|___A|__NX|___D|   0|___G)
-#define __PAGE_KERNEL_ROX   (__PP|   0|   0|___A|   0|___D|   0|___G)
+#define __PAGE_KERNEL_RO(__PP|   0|   0|___A|__NX|   0|   0|___G)
+#define __PAGE_KERNEL_ROX   (__PP|   0|   0|___A|   0|   0|   0|___G)
 #define __PAGE_KERNEL_NOCACHE   (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
-#define __PAGE_KERNEL_VVAR  (__PP|   0|_USR|___A|__NX|___D|   0|___G)
+#define __PAGE_KERNEL_VVAR  (__PP|   0|_USR|___A|__NX|   0|   0|___G)
 #define __PAGE_KERNEL_LARGE (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
 #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
 #define __PAGE_KERNEL_WP(__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..6bebb95a6988 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1932,7 +1932,7 @@ int set_memory_nx(unsigned long addr, int numpages)
 
 int set_memory_ro(unsigned long addr, int numpages)
 {
-   return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+   return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
 }
 
 int set_memory_rw(unsigned long addr, int numpages)
-- 
2.21.0



[PATCH v25 06/30] x86/cet: Add control-protection fault handler

2021-04-15 Thread Yu-cheng Yu
A control-protection fault is triggered when a control-flow transfer
attempt violates Shadow Stack or Indirect Branch Tracking constraints.
For example, the return address for a RET instruction differs from the copy
on the shadow stack; or an indirect JMP instruction, without the NOTRACK
prefix, arrives at a non-ENDBR opcode.

The control-protection fault handler works in a similar way to the general
protection fault handler.  It provides the si_code SEGV_CPERR to the signal
handler.
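
For illustration, how user space might observe this; a minimal sketch that
assumes headers exposing the new SEGV_CPERR si_code:

    #include <signal.h>
    #include <unistd.h>

    static void segv_handler(int sig, siginfo_t *info, void *ucontext)
    {
        if (info->si_code == SEGV_CPERR)    /* control-protection fault */
            write(2, "control-protection fault\n", 25);
        _exit(1);
    }

    int main(void)
    {
        struct sigaction sa = { 0 };

        sa.sa_sigaction = segv_handler;
        sa.sa_flags = SA_SIGINFO;
        sigemptyset(&sa.sa_mask);
        sigaction(SIGSEGV, &sa, NULL);
        /* ... run code that may violate shadow stack constraints ... */
        return 0;
    }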

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Michael Kerrisk 
---
v25:
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK.
- Change X86_FEATURE_CET to X86_FEATURE_SHSTK.

 arch/x86/include/asm/idtentry.h|  4 ++
 arch/x86/kernel/idt.c  |  4 ++
 arch/x86/kernel/signal_compat.c|  2 +-
 arch/x86/kernel/traps.c| 63 ++
 include/uapi/asm-generic/siginfo.h |  3 +-
 5 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 5eb3bdf36a41..5791c02864ec 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -571,6 +571,10 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS, exc_stack_segment);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP,exc_general_protection);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,exc_alignment_check);
 
+#ifdef CONFIG_X86_SHADOW_STACK
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
+#endif
+
 /* Raw exception entries which need extra work */
 DECLARE_IDTENTRY_RAW(X86_TRAP_UD,  exc_invalid_op);
 DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ee1a283f8e96..0315fb297dd3 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -105,6 +105,10 @@ static const __initconst struct idt_data def_idts[] = {
 #elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR,   entry_INT80_32),
 #endif
+
+#ifdef CONFIG_X86_SHADOW_STACK
+   INTG(X86_TRAP_CP,   asm_exc_control_protection),
+#endif
 };
 
 /*
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..dd92490b1e7f 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void)
 */
BUILD_BUG_ON(NSIGILL  != 11);
BUILD_BUG_ON(NSIGFPE  != 15);
-   BUILD_BUG_ON(NSIGSEGV != 9);
+   BUILD_BUG_ON(NSIGSEGV != 10);
BUILD_BUG_ON(NSIGBUS  != 5);
BUILD_BUG_ON(NSIGTRAP != 5);
BUILD_BUG_ON(NSIGCHLD != 6);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 651e3e508959..a40b34b09400 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -606,6 +607,68 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
cond_local_irq_disable(regs);
 }
 
+#ifdef CONFIG_X86_SHADOW_STACK
+static const char * const control_protection_err[] = {
+   "unknown",
+   "near-ret",
+   "far-ret/iret",
+   "endbranch",
+   "rstorssp",
+   "setssbsy",
+   "unknown",
+};
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+/*
+ * When a control protection exception occurs, send a signal to the responsible
+ * application.  Currently, control protection is only enabled for user mode.
+ * This exception should not come from kernel mode.
+ */
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+   struct task_struct *tsk;
+
+   if (!user_mode(regs)) {
+   pr_emerg("PANIC: unexpected kernel control protection fault\n");
+   die("kernel control protection fault", regs, error_code);
+   panic("Machine halted.");
+   }
+
+   cond_local_irq_enable(regs);
+
+   if (!boot_cpu_has(X86_FEATURE_SHSTK))
+   WARN_ONCE(1, "Control protection fault with CET support disabled\n");
+
+   tsk = current;
+   tsk->thread.error_code = error_code;
+   tsk->thread.trap_nr = X86_TRAP_CP;
+
+   /*
+* Ratelimit to prevent log spamming.
+*/
+   if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+   __ratelimit(&cpf_rate)) {
+   unsigned long ssp;
+   int cpf_type;
+
+   cpf_type = array_index_nospec(error_code, ARRAY_SIZE(control_protection_err));
+
+   rdmsrl(MSR_IA32_PL3_SSP, ssp);
+   pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)",
+tsk->comm, task_pid_nr(tsk),
+regs->ip, regs->sp, ssp, error_code,
+control_protection_err[cpf_type]);

[PATCH v25 04/30] x86/cpufeatures: Introduce CPU setup and option parsing for CET

2021-04-15 Thread Yu-cheng Yu
Introduce CPU setup and boot option parsing for CET features.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Remove software-defined X86_FEATURE_CET.
v24:
- Update #ifdef placement to reflect Kconfig changes of splitting shadow stack and ibt.

 arch/x86/include/uapi/asm/processor-flags.h |  2 ++
 arch/x86/kernel/cpu/common.c| 14 ++
 2 files changed, 16 insertions(+)

diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..a8df907e8017 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
 #define X86_CR4_SMAP   _BITUL(X86_CR4_SMAP_BIT)
 #define X86_CR4_PKE_BIT 22 /* enable Protection Keys support */
 #define X86_CR4_PKE     _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT 23 /* enable Control-flow Enforcement */
+#define X86_CR4_CET     _BITUL(X86_CR4_CET_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ab640abe26b6..b6eeb5f2ae4d 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -510,6 +510,14 @@ static __init int setup_disable_pku(char *arg)
 __setup("nopku", setup_disable_pku);
 #endif /* CONFIG_X86_64 */
 
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return;
+
+   cr4_set_bits(X86_CR4_CET);
+}
+
 /*
  * Some CPU features depend on higher CPUID levels, which may not always
  * be available due to CPUID level capping or broken virtualization
@@ -1255,6 +1263,11 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
 
+   if (cmdline_find_option_bool(boot_command_line, "no_user_shstk"))
+   setup_clear_cpu_cap(X86_FEATURE_SHSTK);
+   if (cmdline_find_option_bool(boot_command_line, "no_user_ibt"))
+   setup_clear_cpu_cap(X86_FEATURE_IBT);
+
 arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -1594,6 +1607,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
x86_init_rdrand(c);
setup_pku(c);
+   setup_cet(c);
 
/*
 * Clear/Set all flags overridden by options, need do it
-- 
2.21.0



[PATCH v25 00/30] Control-flow Enforcement: Shadow Stack

2021-04-15 Thread Yu-cheng Yu
Control-flow Enforcement (CET) is a new Intel processor feature that blocks
return/jump-oriented programming attacks.  Details are in "Intel 64 and
IA-32 Architectures Software Developer's Manual" [1].

CET can protect applications and the kernel.  This series enables only
application-level protection, and has three parts:

  - Shadow stack [2],
  - Indirect branch tracking [3], and
  - Selftests [4].

I have run tests on these patches for quite some time, and they have been
very stable.  Linux distributions with CET are available now, and Intel
processors with CET are already on the market.  It would be nice if CET
support could be accepted into the kernel.  I will be working to address
any issues should they come up.

Changes in v25:
- Remove Kconfig X86_CET and software-defined feature flag X86_FEATURE_CET.
  Use X86_SHADOW_STACK and X86_FEATURE_SHSTK directly.  Update related
  areas accordingly.
- Patch #16: Make same changes to do_huge_pmd_numa_page() as to
  do_numa_page().
- Patch #25: Update signal handling, use restorer address already
  retrieved, update MSR restoring code.
- Smaller changes are called out in each patch.
- Rebase to Linus tree v5.12-rc7.

[1] Intel 64 and IA-32 Architectures Software Developer's Manual:

https://software.intel.com/en-us/download/intel-64-and-ia-32-architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4

[2] Shadow Stack patches v24:

https://lore.kernel.org/r/20210401221104.31584-1-yu-cheng...@intel.com/

[3] Indirect Branch Tracking patches v24

https://lore.kernel.org/r/20210401221403.32253-1-yu-cheng...@intel.com/

[4] I am holding off the selftests changes and working to get Reviewed-by's.
The earlier version of the selftests patches:

https://lkml.kernel.org/r/20200521211720.20236-1-yu-cheng...@intel.com/

[5] The kernel ptrace patch is tested with an Intel-internal updated GDB.
I am holding off the kernel ptrace patch to re-test it with my earlier
patch for fixing regset holes.

Yu-cheng Yu (30):
  Documentation/x86: Add CET description
  x86/cet/shstk: Add Kconfig option for Shadow Stack
  x86/cpufeatures: Add CET CPU feature flags for Control-flow
Enforcement Technology (CET)
  x86/cpufeatures: Introduce CPU setup and option parsing for CET
  x86/fpu/xstate: Introduce CET MSR and XSAVES supervisor states
  x86/cet: Add control-protection fault handler
  x86/mm: Remove _PAGE_DIRTY from kernel RO pages
  x86/mm: Move pmd_write(), pud_write() up in the file
  x86/mm: Introduce _PAGE_COW
  drm/i915/gvt: Change _PAGE_DIRTY to _PAGE_DIRTY_BITS
  x86/mm: Update pte_modify for _PAGE_COW
  x86/mm: Update ptep_set_wrprotect() and pmdp_set_wrprotect() for
transition from _PAGE_DIRTY to _PAGE_COW
  mm: Introduce VM_SHADOW_STACK for shadow stack memory
  x86/mm: Shadow Stack page fault error checking
  x86/mm: Update maybe_mkwrite() for shadow stack
  mm: Fixup places that call pte_mkwrite() directly
  mm: Add guard pages around a shadow stack.
  mm/mmap: Add shadow stack pages to memory accounting
  mm: Update can_follow_write_pte() for shadow stack
  mm/mprotect: Exclude shadow stack from preserve_write
  mm: Re-introduce vm_flags to do_mmap()
  x86/cet/shstk: Add user-mode shadow stack support
  x86/cet/shstk: Handle thread shadow stack
  x86/cet/shstk: Introduce shadow stack token setup/verify routines
  x86/cet/shstk: Handle signals for shadow stack
  ELF: Introduce arch_setup_elf_property()
  x86/cet/shstk: Add arch_prctl functions for shadow stack
  mm: Move arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h
  mm: Update arch_validate_flags() to include vma anonymous
  mm: Introduce PROT_SHSTK for shadow stack

 .../admin-guide/kernel-parameters.txt |   6 +
 Documentation/filesystems/proc.rst|   1 +
 Documentation/x86/index.rst   |   1 +
 Documentation/x86/intel_cet.rst   | 136 
 arch/arm64/include/asm/elf.h  |   5 +
 arch/arm64/include/asm/mman.h |   4 +-
 arch/sparc/include/asm/mman.h |   4 +-
 arch/x86/Kconfig  |  24 ++
 arch/x86/Kconfig.assembler|   5 +
 arch/x86/ia32/ia32_signal.c   |  24 +-
 arch/x86/include/asm/cet.h|  52 +++
 arch/x86/include/asm/cpufeatures.h|   2 +
 arch/x86/include/asm/disabled-features.h  |   8 +-
 arch/x86/include/asm/elf.h|  13 +
 arch/x86/include/asm/fpu/internal.h   |   2 +
 arch/x86/include/asm/fpu/types.h  |  23 +-
 arch/x86/include/asm/fpu/xstate.h |   6 +-
 arch/x86/include/asm/idtentry.h   |   4 +
 arch/x86/include/asm/mman.h   |  87 +
 arch/x86/include/asm/mmu_context.h|   3 +
 arch/x86/include/asm/msr-index.h  |  19 ++
 arch/x86/include/asm/page_types.h |   7 +
 arch/x86/include/asm/pgtable.h| 299 ++

[PATCH v25 05/30] x86/fpu/xstate: Introduce CET MSR and XSAVES supervisor states

2021-04-15 Thread Yu-cheng Yu
Control-flow Enforcement Technology (CET) introduces these MSRs:

MSR_IA32_U_CET (user-mode CET settings),
MSR_IA32_PL3_SSP (user-mode shadow stack pointer),

MSR_IA32_PL0_SSP (kernel-mode shadow stack pointer),
MSR_IA32_PL1_SSP (Privilege Level 1 shadow stack pointer),
MSR_IA32_PL2_SSP (Privilege Level 2 shadow stack pointer),
MSR_IA32_S_CET (kernel-mode CET settings),
MSR_IA32_INT_SSP_TAB (exception shadow stack table).

The two user-mode MSRs belong to XFEATURE_CET_USER.  The first three
kernel-mode MSRs belong to XFEATURE_CET_KERNEL.  Both XSAVES states are
supervisor states.  This means that there is no direct, unprivileged access
to these states, making it harder for an attacker to subvert CET.

For sigreturn and future ptrace() support, the shadow stack address and MSR
reserved bits are checked before being written to the supervisor states.
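
As a rough sketch (not part of this patch), kernel code that already
owns the task's registers could turn user shadow stack on through these
MSRs; sketch_enable_user_shstk() is a hypothetical helper, and the MSR
and bit names come from the msr-index.h hunk below:

    static void sketch_enable_user_shstk(u64 ssp)
    {
            u64 msr_val;

            rdmsrl(MSR_IA32_U_CET, msr_val);        /* current user-mode CET settings */
            wrmsrl(MSR_IA32_U_CET, msr_val | CET_SHSTK_EN);
            wrmsrl(MSR_IA32_PL3_SSP, ssp);          /* new user shadow stack pointer */
    }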

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Update xsave_cpuid_features[].  Now CET XSAVES features depend on
  X86_FEATURE_SHSTK (vs. the software-defined X86_FEATURE_CET).

 arch/x86/include/asm/fpu/types.h  | 23 +--
 arch/x86/include/asm/fpu/xstate.h |  6 --
 arch/x86/include/asm/msr-index.h  | 19 +++
 arch/x86/kernel/fpu/xstate.c  | 10 +-
 4 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..035eb0ec665e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -115,8 +115,8 @@ enum xfeature {
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_PASID,
-   XFEATURE_RSRVD_COMP_11,
-   XFEATURE_RSRVD_COMP_12,
+   XFEATURE_CET_USER,
+   XFEATURE_CET_KERNEL,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -135,6 +135,8 @@ enum xfeature {
 #define XFEATURE_MASK_PT   (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
 #define XFEATURE_MASK_PASID(1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL   (1 << XFEATURE_CET_KERNEL)
 #define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
@@ -237,6 +239,23 @@ struct pkru_state {
u32 pad;
 } __packed;
 
+/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+   u64 user_cet;   /* user control-flow settings */
+   u64 user_ssp;   /* user shadow stack pointer */
+};
+
+/*
+ * State component 12 is Control-flow Enforcement kernel states
+ */
+struct cet_kernel_state {
+   u64 kernel_ssp; /* kernel shadow stack */
+   u64 pl1_ssp;/* privilege level 1 shadow stack */
+   u64 pl2_ssp;/* privilege level 2 shadow stack */
+};
+
 /*
  * State component 15: Architectural LBR configuration state.
  * The size of Arch LBR state depends on the number of LBRs (lbr_depth).
diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..582f3575e0bd 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -35,7 +35,8 @@
  XFEATURE_MASK_BNDCSR)
 
 /* All currently supported supervisor features */
-#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+   XFEATURE_MASK_CET_USER)
 
 /*
  * A supervisor state component may not always contain valuable information,
@@ -62,7 +63,8 @@
  * Unsupported supervisor features. When a supervisor feature in this mask is
  * supported in the future, move it to the supported supervisor feature mask.
  */
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
+ XFEATURE_MASK_CET_KERNEL)
 
 /* All supervisor states including supported and unsupported states. */
 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 546d6ecf0a35..5f4b7edead0b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -933,4 +933,23 @@
 #define MSR_VM_IGNNE			0xc0010115
 #define MSR_VM_HSAVE_PA 0xc0010117
 
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x06a0 /* user mode cet setting */
+#define MSR_IA32_S_CET 0x06a2 /* kernel mode cet setting */
+#define CET_SHSTK_EN   BIT_ULL(0)
+#define CET_WRSS_EN			BIT_ULL(1)

[PATCH v25 03/30] x86/cpufeatures: Add CET CPU feature flags for Control-flow Enforcement Technology (CET)

2021-04-15 Thread Yu-cheng Yu
Add CPU feature flags for Control-flow Enforcement Technology (CET).

CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect Branch Tracking
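
For illustration only (not part of this patch), a userspace probe for
these two bits could use the compiler's <cpuid.h> helper:

    #include <cpuid.h>
    #include <stdio.h>

    int main(void)
    {
            unsigned int eax, ebx, ecx, edx;

            if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
                    return 1;
            /* ECX[7] = shadow stack, EDX[20] = indirect branch tracking */
            printf("SHSTK=%u IBT=%u\n", (ecx >> 7) & 1, (edx >> 20) & 1);
            return 0;
    }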

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Make X86_FEATURE_IBT depend on X86_FEATURE_SHSTK.
v24:
- Update for splitting CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK and 
CONFIG_X86_IBT.
- Move DISABLE_IBT definition to the IBT series.

 arch/x86/include/asm/cpufeatures.h   | 2 ++
 arch/x86/include/asm/disabled-features.h | 8 +++-
 arch/x86/kernel/cpu/cpuid-deps.c | 2 ++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..bf861fc89fef 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -345,6 +345,7 @@
 #define X86_FEATURE_OSPKE		(16*32+ 4) /* OS Protection Keys Enable */
 #define X86_FEATURE_WAITPKG		(16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE Instructions */
 #define X86_FEATURE_AVX512_VBMI2	(16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK		(16*32+ 7) /* Shadow Stack */
 #define X86_FEATURE_GFNI		(16*32+ 8) /* Galois Field New Instructions */
 #define X86_FEATURE_VAES		(16*32+ 9) /* Vector AES */
 #define X86_FEATURE_VPCLMULQDQ		(16*32+10) /* Carry-Less Multiplication Double Quadword */
@@ -377,6 +378,7 @@
 #define X86_FEATURE_TSXLDTRK		(18*32+16) /* TSX Suspend Load Address Tracking */
 #define X86_FEATURE_PCONFIG		(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR		(18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_IBT			(18*32+20) /* Indirect Branch Tracking */
 #define X86_FEATURE_AVX512_FP16		(18*32+23) /* AVX512 FP16 */
 #define X86_FEATURE_SPEC_CTRL		(18*32+26) /* "" Speculation Control (IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP		(18*32+27) /* "" Single Thread Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index b7dd944dc867..e5c6ed9373e8 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -68,6 +68,12 @@
 # define DISABLE_SGX   (1 << (X86_FEATURE_SGX & 31))
 #endif
 
+#ifdef CONFIG_X86_SHADOW_STACK
+#define DISABLE_SHSTK  0
+#else
+#define DISABLE_SHSTK  (1 << (X86_FEATURE_SHSTK & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -88,7 +94,7 @@
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-			 DISABLE_ENQCMD)
+			 DISABLE_ENQCMD|DISABLE_SHSTK)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK18	0
 #define DISABLED_MASK19	0
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..c88b7e0ebf6d 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16,  X86_FEATURE_AVX512BW  },
{ X86_FEATURE_ENQCMD,   X86_FEATURE_XSAVES},
{ X86_FEATURE_PER_THREAD_MBA,   X86_FEATURE_MBA   },
+   { X86_FEATURE_SHSTK,X86_FEATURE_XSAVES},
+   { X86_FEATURE_IBT,  X86_FEATURE_SHSTK},
{}
 };
 
-- 
2.21.0



[PATCH v25 02/30] x86/cet/shstk: Add Kconfig option for Shadow Stack

2021-04-15 Thread Yu-cheng Yu
Shadow Stack provides protection against function return address
corruption.  It is active when the processor supports it, the kernel has
CONFIG_X86_SHADOW_STACK enabled, and the application is built for the
feature.  This is only implemented for the 64-bit kernel.  When it is
enabled, legacy non-Shadow Stack applications continue to work, but without
protection.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v25:
- Remove X86_CET and use X86_SHADOW_STACK directly.
v24:
- Update for the splitting X86_CET to X86_SHADOW_STACK and X86_IBT.

 arch/x86/Kconfig   | 22 ++
 arch/x86/Kconfig.assembler |  5 +
 2 files changed, 27 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..41283f82fd87 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86_64
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
+   select ARCH_HAS_SHADOW_STACK
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
@@ -1941,6 +1942,27 @@ config X86_SGX
 
  If unsure, say N.
 
+config ARCH_HAS_SHADOW_STACK
+   def_bool n
+
+config X86_SHADOW_STACK
+   prompt "Intel Shadow Stack"
+   def_bool n
+   depends on AS_WRUSS
+   depends on ARCH_HAS_SHADOW_STACK
+   select ARCH_USES_HIGH_VMA_FLAGS
+   help
+ Shadow Stack protection is a hardware feature that detects function
+ return address corruption.  This helps mitigate ROP attacks.
+ Applications must be enabled to use it, and old userspace does not
+ get protection "for free".
+ Support for this feature is present on Tiger Lake family of
+ processors released in 2020 or later.  Enabling this feature
+ increases kernel text size by 3.7 KB.
+ See Documentation/x86/intel_cet.rst for more information.
+
+ If unsure, say N.
+
 config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 26b8c08e2fc4..00c79dd93651 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -19,3 +19,8 @@ config AS_TPAUSE
def_bool $(as-instr,tpause %ecx)
help
  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
+
+config AS_WRUSS
+   def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+   help
+ Supported by binutils >= 2.31 and LLVM integrated assembler
-- 
2.21.0



[PATCH v25 01/30] Documentation/x86: Add CET description

2021-04-15 Thread Yu-cheng Yu
Explain no_user_shstk/no_user_ibt kernel parameters, and introduce a new
document on Control-flow Enforcement Technology (CET).

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for Kconfig changes from X86_CET to X86_SHADOW_STACK, X86_IBT.
- Update for the change of VM_SHSTK to VM_SHADOW_STACK.

 .../admin-guide/kernel-parameters.txt |   6 +
 Documentation/x86/index.rst   |   1 +
 Documentation/x86/intel_cet.rst   | 136 ++
 3 files changed, 143 insertions(+)
 create mode 100644 Documentation/x86/intel_cet.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 04545725f187..bc79e54be91e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3220,6 +3220,12 @@
noexec=on: enable non-executable mappings (default)
noexec=off: disable non-executable mappings
 
+   no_user_shstk   [X86-64] Disable Shadow Stack for user-mode
+   applications
+
+   no_user_ibt [X86-64] Disable Indirect Branch Tracking for user-mode
+   applications
+
nosmap  [X86,PPC]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 4693e192b447..cf5250a3cc70 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -21,6 +21,7 @@ x86-specific Documentation
tlb
mtrr
pat
+   intel_cet
intel-iommu
intel_txt
amd-memory-encryption
diff --git a/Documentation/x86/intel_cet.rst b/Documentation/x86/intel_cet.rst
new file mode 100644
index ..ae30c392994a
--- /dev/null
+++ b/Documentation/x86/intel_cet.rst
@@ -0,0 +1,136 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Control-flow Enforcement Technology (CET)
+=========================================
+
+[1] Overview
+============
+
+Control-flow Enforcement Technology (CET) is an Intel processor feature
+that provides protection against return/jump-oriented programming (ROP)
+attacks.  It can be set up to protect both applications and the kernel.
+Only user-mode protection is implemented in the 64-bit kernel, including
+support for running legacy 32-bit applications.
+
+CET introduces Shadow Stack and Indirect Branch Tracking.  Shadow stack is
+a secondary stack allocated from memory and cannot be directly modified by
+applications.  When executing a CALL instruction, the processor pushes the
+return address to both the normal stack and the shadow stack.  Upon
+function return, the processor pops the shadow stack copy and compares it
+to the normal stack copy.  If the two differ, the processor raises a
+control-protection fault.  Indirect branch tracking verifies indirect
+CALL/JMP targets are intended as marked by the compiler with 'ENDBR'
+opcodes.
+
+There are two Kconfig options:
+
+X86_SHADOW_STACK and X86_IBT.
+
+To build a CET-enabled kernel, Binutils v2.31 and GCC v8.1 or LLVM v10.0.1
+or later are required.  To build a CET-enabled application, GLIBC v2.28 or
+later is also required.
+
+There are two command-line options for disabling CET features::
+
+no_user_shstk - disables user shadow stack, and
+no_user_ibt   - disables user indirect branch tracking.
+
+At run time, /proc/cpuinfo shows CET features if the processor supports
+CET.
+
+[2] Application Enabling
+========================
+
+An application's CET capability is marked in its ELF header and can be
+verified from readelf/llvm-readelf output:
+
+readelf -n <app> | grep -a SHSTK
+properties: x86 feature: IBT, SHSTK
+
+If an application supports CET and is statically linked, it will run with
+CET protection.  If the application needs any shared libraries, the loader
+checks all dependencies and enables CET when all requirements are met.
+
+[3] Backward Compatibility
+==========================
+
+GLIBC provides a few CET tunables via the GLIBC_TUNABLES environment
+variable:
+
+GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT
+Turn off SHSTK/IBT.
+
+GLIBC_TUNABLES=glibc.tune.x86_shstk=
+This controls how dlopen() handles SHSTK legacy libraries::
+
+on - continue with SHSTK enabled;
+permissive - continue with SHSTK off.
+
+Details can be found in the GLIBC manual pages.
+
+[4] CET arch_prctl()'s
+======================
+
+Several arch_prctl()'s have been added for CET:
+
+arch_prctl(ARCH_X86_CET_STATUS, u64 *addr)
+Return CET feature status.
+
+The parameter 'addr' is a pointer to a user buffer.
+On returning to the caller, the kernel fills the following
+information::
+
+*addr   = shadow stack/indirect branch tracking status
+*(addr + 1) = shadow stack base address
+*(addr + 2) = shadow stack size

[PATCH v24 7/9] x86/vdso: Introduce ENDBR macro

2021-04-01 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

There are two ENDBR versions: endbr64 and endbr32.  The compilers (gcc and
clang) have _CET_ENDBR defined for the proper one.  Introduce ENDBR macro,
which equals the compiler macro when enabled, otherwise nothing.

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
 arch/x86/entry/vdso/Makefile |  1 +
 arch/x86/include/asm/vdso.h  | 19 ++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index a773a5f03b63..be2ce5c8cb42 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -95,6 +95,7 @@ $(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(
 
 ifdef CONFIG_X86_IBT
 $(vobjs) $(vobjs32): KBUILD_CFLAGS += -fcf-protection=branch
+$(vobjs) $(vobjs32): KBUILD_AFLAGS += -fcf-protection=branch
 endif
 
 #
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 98aa103eb4ab..0128486ba09f 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -52,6 +52,23 @@ extern int map_vdso_once(const struct vdso_image *image, unsigned long addr);
 extern bool fixup_vdso_exception(struct pt_regs *regs, int trapnr,
 unsigned long error_code,
 unsigned long fault_addr);
-#endif /* __ASSEMBLER__ */
+#else /* __ASSEMBLER__ */
+
+/*
+ * ENDBR is an instruction for the Indirect Branch Tracking (IBT) component
+ * of CET.  IBT prevents attacks by ensuring that (most) indirect branches
+ * and function calls may only land at ENDBR instructions.  Branches that don't
+ * follow the rules will result in control flow (#CF) exceptions.
+ * ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
+ * instructions are inserted automatically by the compiler, but branch
+ * targets written in assembly must have ENDBR added manually.
+ */
+#ifdef __CET__
+#include 
+#define ENDBR _CET_ENDBR
+#else
+#define ENDBR
+#endif
 
+#endif /* __ASSEMBLER__ */
 #endif /* _ASM_X86_VDSO_H */
-- 
2.21.0



[PATCH v24 6/9] x86/vdso: Insert endbr32/endbr64 to vDSO

2021-04-01 Thread Yu-cheng Yu
From: "H.J. Lu" 

When Indirect Branch Tracking (IBT) is enabled, vDSO functions may be
called indirectly, and must have ENDBR32 or ENDBR64 as the first
instruction.  The compiler must support -fcf-protection=branch so that it
can be used to compile vDSO.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Kees Cook 
---
v24:
- Replace CONFIG_X86_CET with CONFIG_X86_IBT to reflect splitting of shadow
  stack and ibt.

 arch/x86/entry/vdso/Makefile | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 05c4abc2fdfd..a773a5f03b63 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -93,6 +93,10 @@ endif
 
 $(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 
+ifdef CONFIG_X86_IBT
+$(vobjs) $(vobjs32): KBUILD_CFLAGS += -fcf-protection=branch
+endif
+
 #
 # vDSO code runs in userspace and -pg doesn't help with profiling anyway.
 #
-- 
2.21.0



[PATCH v24 9/9] x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

2021-04-01 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add ENDBR to __vdso_sgx_enter_enclave() branch targets.

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
 arch/x86/entry/vdso/vsgx.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 86a0e94f68df..c63eafa54abd 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include <asm/vdso.h>
 
 #include "extable.h"
 
@@ -27,6 +28,7 @@
 SYM_FUNC_START(__vdso_sgx_enter_enclave)
/* Prolog */
.cfi_startproc
+   ENDBR
push%rbp
.cfi_adjust_cfa_offset  8
.cfi_rel_offset %rbp, 0
@@ -62,6 +64,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
 .Lasync_exit_pointer:
 .Lenclu_eenter_eresume:
enclu
+   ENDBR
 
/* EEXIT jumps here unless the enclave is doing something fancy. */
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
@@ -91,6 +94,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
jmp .Lout
 
 .Lhandle_exception:
+   ENDBR
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
 
/* Set the exception info. */
-- 
2.21.0



[PATCH v24 5/9] x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking

2021-04-01 Thread Yu-cheng Yu
From: "H.J. Lu" 

Update ARCH_X86_CET_STATUS and ARCH_X86_CET_DISABLE for Indirect Branch
Tracking.
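
As a hypothetical userspace sketch (the constants mirror this series'
asm/prctl.h and uapi/linux/elf.h; none of this is part of the patch),
turning IBT off for the current task would then look like:

    #include <unistd.h>
    #include <sys/syscall.h>

    #define ARCH_X86_CET_DISABLE            0x3002
    #define GNU_PROPERTY_X86_FEATURE_1_IBT  (1U << 0)

    int main(void)
    {
            /* Fails with -EPERM once ARCH_X86_CET_LOCK has been applied. */
            return syscall(SYS_arch_prctl, ARCH_X86_CET_DISABLE,
                           GNU_PROPERTY_X86_FEATURE_1_IBT);
    }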

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
v24:
- Update for function name changes introduced from splitting shadow stack and ibt.

 arch/x86/kernel/cet_prctl.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
index 5f0054177d2a..ae31741d829a 100644
--- a/arch/x86/kernel/cet_prctl.c
+++ b/arch/x86/kernel/cet_prctl.c
@@ -22,6 +22,9 @@ static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
buf[2] = cet->shstk_size;
}
 
+   if (cet->ibt_enabled)
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_IBT;
+
return copy_to_user(ubuf, buf, sizeof(buf));
 }
 
@@ -46,6 +49,8 @@ int prctl_cet(int option, u64 arg2)
return -EINVAL;
if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
shstk_disable();
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   ibt_disable();
return 0;
 
case ARCH_X86_CET_LOCK:
-- 
2.21.0



[PATCH v24 4/9] x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking

2021-04-01 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates features the file supports.
The property is parsed at loading time and passed to
arch_setup_elf_property().  Update it for Indirect Branch Tracking.
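
For reference, a rough sketch of the property entry that reaches
arch_parse_elf_property() (field names follow the ELF gABI
program-property layout; the struct itself is only illustrative):

    struct gnu_property_entry {
            unsigned int pr_type;   /* e.g. GNU_PROPERTY_X86_FEATURE_1_AND */
            unsigned int pr_datasz; /* sizeof(unsigned int) for this type */
            unsigned int pr_data;   /* bitmask: ..._FEATURE_1_IBT, ..._1_SHSTK */
            /* data is padded to 8 bytes on ELF64 */
    };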

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for changes introduced from splitting shadow stack and ibt.

 arch/x86/Kconfig | 2 ++
 arch/x86/kernel/process_64.c | 8 
 2 files changed, 10 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a58c5230e957..5496a1b79318 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1975,6 +1975,8 @@ config X86_IBT
depends on X86_64
depends on $(cc-option,-fcf-protection)
select X86_CET
+   select ARCH_USE_GNU_PROPERTY
+   select ARCH_BINFMT_ELF_STATE
help
  Indirect Branch Tracking (IBT) provides protection against
  CALL-/JMP-oriented programming attacks.  It is active when
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 892d8e742e3b..8137e8af4503 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -864,6 +864,14 @@ int arch_setup_elf_property(struct arch_elf_state *state)
r = shstk_setup();
}
 
+   if (r < 0)
+   return r;
+
+   if (cpu_feature_enabled(X86_FEATURE_IBT)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   r = ibt_setup();
+   }
+
return r;
 }
 #endif
-- 
2.21.0



[PATCH v24 8/9] x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point

2021-04-01 Thread Yu-cheng Yu
From: "H.J. Lu" 

ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add that to __kernel_vsyscall entry point.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Kees Cook 
---
 arch/x86/entry/vdso/vdso32/system_call.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S
index de1fff7188aa..c962e7e4f7e3 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include <asm/vdso.h>
 
.text
.globl __kernel_vsyscall
@@ -14,6 +15,7 @@
ALIGN
 __kernel_vsyscall:
CFI_STARTPROC
+   ENDBR
/*
 * Reshuffle regs so that all of any of the entry instructions
 * will preserve enough state.
-- 
2.21.0



[PATCH v24 3/9] x86/cet/ibt: Handle signals for Indirect Branch Tracking

2021-04-01 Thread Yu-cheng Yu
When an indirect CALL/JMP instruction is executed and before it reaches
the target, it is in 'WAIT_ENDBR' status, which can be read from
MSR_IA32_U_CET.  The status is part of a task's status before a signal is
raised and preserved in the signal frame.  It is restored for sigreturn.

IBT state machine is described in Intel SDM Vol. 1, Sec. 18.3.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for changes from splitting shadow stack and ibt.

 arch/x86/kernel/fpu/signal.c | 30 +++---
 1 file changed, 27 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 2e56f2fe8be0..1f54c18607c9 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -71,16 +71,32 @@ int save_extra_state_to_sigframe(int ia32, void __user *fp, unsigned long restorer
return err;
 
ext.ssp = token_addr;
+   }
 
+   if (new_ssp || cet->ibt_enabled) {
fpregs_lock();
if (test_thread_flag(TIF_NEED_FPU_LOAD))
__fpregs_load_activate();
+
if (new_ssp)
wrmsrl(MSR_IA32_PL3_SSP, new_ssp);
+
+   if (cet->ibt_enabled) {
+   u64 r;
+
+   rdmsrl(MSR_IA32_U_CET, r);
+
+   if (r & CET_WAIT_ENDBR) {
+   ext.wait_endbr = 1;
+   r &= ~CET_WAIT_ENDBR;
+   wrmsrl(MSR_IA32_U_CET, r);
+   }
+   }
+
fpregs_unlock();
}
 
-   if (ext.ssp) {
+   if (ext.ssp || cet->ibt_enabled) {
void __user *p = fp;
 
ext.total_size = sizeof(ext);
@@ -110,7 +126,8 @@ static int get_extra_state_from_sigframe(int ia32, void __user *fp, struct sc_ext
if (!cpu_feature_enabled(X86_FEATURE_CET))
return 0;
 
-   if (!cet->shstk_size)
+   if (!cet->shstk_size &&
+   !cet->ibt_enabled)
return 0;
 
memset(ext, 0, sizeof(*ext));
@@ -162,6 +179,13 @@ void restore_extra_state(struct sc_ext *sc_ext)
msr_val |= CET_SHSTK_EN;
}
 
+   if (cet->ibt_enabled) {
+   msr_val |= (CET_ENDBR_EN | CET_NO_TRACK_EN);
+
+   if (sc_ext->wait_endbr)
+   msr_val |= CET_WAIT_ENDBR;
+   }
+
if (test_thread_flag(TIF_NEED_FPU_LOAD))
cet_user_state->user_cet = msr_val;
else
@@ -626,7 +650,7 @@ static unsigned long fpu__alloc_sigcontext_ext(unsigned long sp)
 * sigcontext_ext is at: fpu + fpu_user_xstate_size +
 * FP_XSTATE_MAGIC2_SIZE, then aligned to 8.
 */
-   if (cet->shstk_size)
+   if (cet->shstk_size || cet->ibt_enabled)
sp -= (sizeof(struct sc_ext) + 8);
 #endif
return sp;
-- 
2.21.0



[PATCH v24 1/9] x86/cet/ibt: Add Kconfig option for Indirect Branch Tracking

2021-04-01 Thread Yu-cheng Yu
Indirect Branch Tracking (IBT) provides protection against CALL-/JMP-
oriented programming attacks.  It is active when the kernel has this
feature enabled, and the processor and the application support it.
When this feature is enabled, legacy non-IBT applications continue to
work, but without IBT protection.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
 arch/x86/Kconfig | 20 
 arch/x86/include/asm/disabled-features.h |  8 +++-
 2 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index a69e351e7386..a58c5230e957 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1969,6 +1969,26 @@ config X86_SHADOW_STACK
 
  If unsure, say N.
 
+config X86_IBT
+   prompt "Intel Indirect Branch Tracking"
+   def_bool n
+   depends on X86_64
+   depends on $(cc-option,-fcf-protection)
+   select X86_CET
+   help
+ Indirect Branch Tracking (IBT) provides protection against
+ CALL-/JMP-oriented programming attacks.  It is active when
+ the kernel has this feature enabled, and the processor and
+ the application support it.  When this feature is enabled,
+ legacy non-IBT applications continue to work, but without
+ IBT protection.
+ Support for this feature is present on Tiger Lake family of
+ processors released in 2020 or later.  Enabling this feature
+ increases kernel text size by 3.7 KB.
+ See Documentation/x86/intel_cet.rst for more information.
+
+ If unsure, say N.
+
 config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index 018cd7acd3e9..9b826b9dd83d 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -74,6 +74,12 @@
 #define DISABLE_SHSTK  (1 << (X86_FEATURE_SHSTK & 31))
 #endif
 
+#ifdef CONFIG_X86_IBT
+#define DISABLE_IBT0
+#else
+#define DISABLE_IBT(1 << (X86_FEATURE_IBT & 31))
+#endif
+
 #ifdef CONFIG_X86_CET
 #define DISABLE_CET0
 #else
@@ -103,7 +109,7 @@
 #define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
			 DISABLE_ENQCMD|DISABLE_SHSTK)
 #define DISABLED_MASK17	0
-#define DISABLED_MASK18	0
+#define DISABLED_MASK18	(DISABLE_IBT)
 #define DISABLED_MASK19	0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 20)
 
-- 
2.21.0



[PATCH v24 2/9] x86/cet/ibt: Add user-mode Indirect Branch Tracking support

2021-04-01 Thread Yu-cheng Yu
Introduce user-mode Indirect Branch Tracking (IBT) support.  Add routines
for the setup/disable of IBT.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Move IBT routines to a separate ibt.c, update related areas accordingly.

 arch/x86/include/asm/cet.h |  9 ++
 arch/x86/kernel/Makefile   |  1 +
 arch/x86/kernel/ibt.c  | 57 ++
 3 files changed, 67 insertions(+)
 create mode 100644 arch/x86/kernel/ibt.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 26124820d46f..b3df306699b4 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -15,6 +15,7 @@ struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
unsigned intlocked:1;
+   unsigned intibt_enabled:1;
 };
 
 #ifdef CONFIG_X86_SHADOW_STACK
@@ -41,6 +42,14 @@ static inline int shstk_check_rstor_token(bool ia32, 
unsigned long token_addr,
  unsigned long *new_ssp) { return 0; }
 #endif
 
+#ifdef CONFIG_X86_IBT
+int ibt_setup(void);
+void ibt_disable(void);
+#else
+static inline int ibt_setup(void) { return 0; }
+static inline void ibt_disable(void) {}
+#endif
+
 #ifdef CONFIG_X86_CET
 int prctl_cet(int option, u64 arg2);
 #else
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 868cb3aac618..9653e422d0f3 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -152,6 +152,7 @@ obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
 obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o
 obj-$(CONFIG_X86_CET)  += cet_prctl.o
+obj-$(CONFIG_X86_IBT)  += ibt.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/ibt.c b/arch/x86/kernel/ibt.c
new file mode 100644
index ..d2cef1a0345b
--- /dev/null
+++ b/arch/x86/kernel/ibt.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * ibt.c - Intel Indirect Branch Tracking support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void start_update_msrs(void)
+{
+   fpregs_lock();
+   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   __fpregs_load_activate();
+}
+
+static void end_update_msrs(void)
+{
+   fpregs_unlock();
+}
+
+int ibt_setup(void)
+{
+   u64 msr_val;
+
+   if (!cpu_feature_enabled(X86_FEATURE_IBT))
+   return -EOPNOTSUPP;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val |= (CET_ENDBR_EN | CET_NO_TRACK_EN);
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 1;
+   return 0;
+}
+
+void ibt_disable(void)
+{
+   u64 msr_val;
+
+   if (!cpu_feature_enabled(X86_FEATURE_IBT))
+   return;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val &= ~CET_ENDBR_EN;
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 0;
+}
-- 
2.21.0



[PATCH v24 0/9] Control-flow Enforcement: Indirect Branch Tracking

2021-04-01 Thread Yu-cheng Yu
Control-flow Enforcement (CET) is a new Intel processor feature that blocks
return/jump-oriented programming attacks.  Details are in "Intel 64 and
IA-32 Architectures Software Developer's Manual" [1].

This is the second part of CET and enables Indirect Branch Tracking (IBT).
It is built on top of the shadow stack series.

Changes in v24:
- Split IBT into a separate Kconfig option, update related areas
  accordingly.  Specific changes are called out in each patch's commit
  log.
- Patch #7: Update ENDBR definition with compiler macros.
- Rebase to Linus tree v5.12-rc5.

[1] Intel 64 and IA-32 Architectures Software Developer's Manual:

https://software.intel.com/en-us/download/intel-64-and-ia-32-architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4

[2] Indirect Branch Tracking patches v23:

https://lore.kernel.org/r/20210316151320.6123-1-yu-cheng...@intel.com/

H.J. Lu (3):
  x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking
  x86/vdso: Insert endbr32/endbr64 to vDSO
  x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point

Yu-cheng Yu (6):
  x86/cet/ibt: Add Kconfig option for Indirect Branch Tracking
  x86/cet/ibt: Add user-mode Indirect Branch Tracking support
  x86/cet/ibt: Handle signals for Indirect Branch Tracking
  x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking
  x86/vdso: Introduce ENDBR macro
  x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

 arch/x86/Kconfig | 22 +
 arch/x86/entry/vdso/Makefile |  5 +++
 arch/x86/entry/vdso/vdso32/system_call.S |  2 +
 arch/x86/entry/vdso/vsgx.S   |  4 ++
 arch/x86/include/asm/cet.h   |  9 
 arch/x86/include/asm/disabled-features.h |  8 +++-
 arch/x86/include/asm/vdso.h  | 19 +++-
 arch/x86/kernel/Makefile |  1 +
 arch/x86/kernel/cet_prctl.c  |  5 +++
 arch/x86/kernel/fpu/signal.c | 30 +++--
 arch/x86/kernel/ibt.c| 57 
 arch/x86/kernel/process_64.c |  8 
 12 files changed, 165 insertions(+), 5 deletions(-)
 create mode 100644 arch/x86/kernel/ibt.c

-- 
2.21.0



[PATCH v24 28/30] mm: Move arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h

2021-04-01 Thread Yu-cheng Yu
To prepare the introduction of PROT_SHSTK and be consistent with other
architectures, move arch_vm_get_page_prot() and arch_calc_vm_prot_bits() to
arch/x86/include/asm/mman.h.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
 arch/x86/include/asm/mman.h  | 30 ++
 arch/x86/include/uapi/asm/mman.h | 27 +++
 2 files changed, 33 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/include/asm/mman.h

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index ..629f6c81263a
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#include 
+#include 
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags)__pgprot(   \
+   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
+   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
+   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
+   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index d4a8d0424bfb..3ce1923e6ed9 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -1,31 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_MMAN_H
-#define _ASM_X86_MMAN_H
+#ifndef _UAPI_ASM_X86_MMAN_H
+#define _UAPI_ASM_X86_MMAN_H
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-/*
- * Take the 4 protection key bits out of the vma->vm_flags
- * value and turn them in to the bits that we can put in
- * to a pte.
- *
- * Only override these if Protection Keys are available
- * (which is only on 64-bit).
- */
-#define arch_vm_get_page_prot(vm_flags)__pgprot(   \
-   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
-
-#define arch_calc_vm_prot_bits(prot, key) (\
-   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
-   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
-   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
-   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
-#endif
 
 #include 
 
-#endif /* _ASM_X86_MMAN_H */
+#endif /* _UAPI_ASM_X86_MMAN_H */
-- 
2.21.0



[PATCH v24 30/30] mm: Introduce PROT_SHSTK for shadow stack

2021-04-01 Thread Yu-cheng Yu
There are three possible options to create a shadow stack allocation API:
an arch_prctl, a new syscall, or adding PROT_SHSTK to mmap()/mprotect().
Each has its advantages and compromises.

An arch_prctl() is the least intrusive.  However, the existing x86
arch_prctl() takes only two parameters.  Multiple parameters must be
passed in a memory buffer.  There is a proposal to pass more parameters in
registers [1], but no active discussion on that.

A new syscall minimizes compatibility issues and offers an extensible
framework to other architectures, but this will likely result in some
overlap with mmap()/mprotect().

The introduction of PROT_SHSTK to mmap()/mprotect() takes advantage of
existing APIs.  The x86-specific PROT_SHSTK is translated to
VM_SHADOW_STACK and a shadow stack mapping is created without reinventing
the wheel.  There are potential pitfalls though.  The most obvious one
would be using this as a bypass to shadow stack protection.  However, the
attacker would have to get to the syscall first.

[1] https://lore.kernel.org/lkml/20200828121624.108243-1-hjl.to...@gmail.com/
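
A minimal userspace sketch of the proposed API (hypothetical; the
PROT_SHSTK value mirrors the uapi hunk below, and PROT_WRITE must not
be combined with it):

    #include <stddef.h>
    #include <sys/mman.h>

    #ifndef PROT_SHSTK
    #define PROT_SHSTK 0x10         /* from this series' uapi/asm/mman.h */
    #endif

    void *alloc_shstk(size_t size)
    {
            /* Anonymous, private, and without PROT_WRITE, as the
             * arch_validate_prot()/arch_validate_flags() hunks below require.
             */
            return mmap(NULL, size, PROT_READ | PROT_SHSTK,
                        MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    }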

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Update arch_calc_vm_prot_bits(), leave PROT* checking to
  arch_validate_prot().
- Update arch_validate_prot(), leave vma flags checking to
  arch_validate_flags().
- Add arch_validate_flags().

 arch/x86/include/asm/mman.h  | 59 +++-
 arch/x86/include/uapi/asm/mman.h |  1 +
 include/linux/mm.h   |  1 +
 3 files changed, 60 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 629f6c81263a..1821c179f35d 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -20,11 +20,68 @@
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
 
-#define arch_calc_vm_prot_bits(prot, key) (\
+#define pkey_vm_prot_bits(prot, key) ( \
((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#else
+#define pkey_vm_prot_bits(prot, key) (0)
 #endif
 
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+  unsigned long pkey)
+{
+   unsigned long vm_prot_bits = pkey_vm_prot_bits(prot, pkey);
+
+   if (prot & PROT_SHSTK)
+   vm_prot_bits |= VM_SHADOW_STACK;
+
+   return vm_prot_bits;
+}
+
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+#ifdef CONFIG_X86_SHADOW_STACK
+static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
+{
+   unsigned long valid = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM |
+ PROT_SHSTK;
+
+   if (prot & ~valid)
+   return false;
+
+   if (prot & PROT_SHSTK) {
+   if (!current->thread.cet.shstk_size)
+   return false;
+
+   /*
+* A shadow stack mapping is indirectly writable by only
+* the CALL and WRUSS instructions, but not other write
+* instructions.  PROT_SHSTK and PROT_WRITE are mutually
+* exclusive.
+*/
+   if (prot & PROT_WRITE)
+   return false;
+   }
+
+   return true;
+}
+
+#define arch_validate_prot arch_validate_prot
+
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
+{
+   if (vm_flags & VM_SHADOW_STACK) {
+   if ((vm_flags & VM_SHARED) || !is_anon)
+   return false;
+   }
+
+   return true;
+}
+
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
+
+#endif /* CONFIG_X86_SHADOW_STACK */
+
 #endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 3ce1923e6ed9..39bb7db344a6 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -4,6 +4,7 @@
 
 #define MAP_32BIT	0x40		/* only give out 32bit addresses */
 
+#define PROT_SHSTK	0x10		/* shadow stack pages */
 
 #include 
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1ccec5cc399b..9a7652eea207 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -342,6 +342,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86)
 # define VM_PAT	VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
+# define VM_ARCH_CLEAR VM_SHADOW_STACK
 #elif defined(CONFIG_PPC)
 # define VM_SAO	VM_ARCH_1	/* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
-- 
2.21.0



[PATCH v24 29/30] mm: Update arch_validate_flags() to include vma anonymous

2021-04-01 Thread Yu-cheng Yu
When newer VM flags are being created, such as VM_MTE, it becomes necessary
for mmap/mprotect to verify if certain flags are being applied to an
anonymous VMA.

To solve this, one approach is adding a VM flag to track that MAP_ANONYMOUS
is specified [1], and then using the flag in arch_validate_flags().

Another approach is passing vma_is_anonymous() to arch_validate_flags().
To prepare the introduction of PROT_SHSTK, which creates a shadow stack
mapping and can only be applied to an anonymous VMA, update
arch_validate_flags() to include anonymous VMA information.

[1] commit 9f3419315f3c ("arm64: mte: Add PROT_MTE support to mmap() and mprotect()")

Signed-off-by: Yu-cheng Yu 
Cc: Catalin Marinas 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
Cc: Vincenzo Frascino 
Cc: Will Deacon 
---
 arch/arm64/include/asm/mman.h | 4 ++--
 arch/sparc/include/asm/mman.h | 4 ++--
 include/linux/mman.h  | 2 +-
 mm/mmap.c | 2 +-
 mm/mprotect.c | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h
index e3e28f7daf62..44add1a09041 100644
--- a/arch/arm64/include/asm/mman.h
+++ b/arch/arm64/include/asm/mman.h
@@ -74,7 +74,7 @@ static inline bool arch_validate_prot(unsigned long prot,
 }
 #define arch_validate_prot(prot, addr) arch_validate_prot(prot, addr)
 
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
 {
if (!system_supports_mte())
return true;
@@ -82,6 +82,6 @@ static inline bool arch_validate_flags(unsigned long vm_flags)
/* only allow VM_MTE if VM_MTE_ALLOWED has been set previously */
return !(vm_flags & VM_MTE) || (vm_flags & VM_MTE_ALLOWED);
 }
-#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
 
 #endif /* ! __ASM_MMAN_H__ */
diff --git a/arch/sparc/include/asm/mman.h b/arch/sparc/include/asm/mman.h
index 274217e7ed70..4a897c8a3f1a 100644
--- a/arch/sparc/include/asm/mman.h
+++ b/arch/sparc/include/asm/mman.h
@@ -60,11 +60,11 @@ static inline int sparc_validate_prot(unsigned long prot, unsigned long addr)
return 1;
 }
 
-#define arch_validate_flags(vm_flags) arch_validate_flags(vm_flags)
+#define arch_validate_flags(vm_flags, is_anon) arch_validate_flags(vm_flags, is_anon)
 /* arch_validate_flags() - Ensure combination of flags is valid for a
  * VMA.
  */
-static inline bool arch_validate_flags(unsigned long vm_flags)
+static inline bool arch_validate_flags(unsigned long vm_flags, bool is_anon)
 {
/* If ADI is being enabled on this VMA, check for ADI
 * capability on the platform and ensure VMA is suitable
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 629cefc4ecba..a22ed4495d13 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -114,7 +114,7 @@ static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
  *
  * Returns true if the VM_* flags are valid.
  */
-static inline bool arch_validate_flags(unsigned long flags)
+static inline bool arch_validate_flags(unsigned long flags, bool is_anonymous)
 {
return true;
 }
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b2992ef8ee0..db849e3ed9d3 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1850,7 +1850,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
}
 
/* Allow architectures to sanity-check the vm_flags */
-   if (!arch_validate_flags(vma->vm_flags)) {
+   if (!arch_validate_flags(vma->vm_flags, vma_is_anonymous(vma))) {
error = -EINVAL;
if (file)
goto unmap_and_free_vma;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 550448dc5ff1..621d73e3667d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -611,7 +611,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
 
/* Allow architectures to sanity-check the new flags */
-   if (!arch_validate_flags(newflags)) {
+   if (!arch_validate_flags(newflags, vma_is_anonymous(vma))) {
error = -EINVAL;
goto out;
}
-- 
2.21.0



[PATCH v24 27/30] x86/cet/shstk: Add arch_prctl functions for shadow stack

2021-04-01 Thread Yu-cheng Yu
arch_prctl(ARCH_X86_CET_STATUS, u64 *args)
Get CET feature status.

The parameter 'args' is a pointer to a user buffer.  The kernel returns
the following information:

*args = shadow stack/IBT status
*(args + 1) = shadow stack base address
*(args + 2) = shadow stack size

32-bit binaries use the same interface, but only lower 32-bits of each
item.

arch_prctl(ARCH_X86_CET_DISABLE, unsigned int features)
Disable CET features specified in 'features'.  Return -EPERM if CET is
locked.

arch_prctl(ARCH_X86_CET_LOCK)
Lock in CET features.

Also change do_arch_prctl_common()'s parameter 'cpuid_enabled' to
'arg2', as it is now also passed to prctl_cet().
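
A hypothetical userspace sketch of the status query (the constant
mirrors this patch's uapi/asm/prctl.h; the program itself is not part
of the patch):

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define ARCH_X86_CET_STATUS 0x3001

    int main(void)
    {
            unsigned long long buf[3] = {0};

            if (syscall(SYS_arch_prctl, ARCH_X86_CET_STATUS, buf))
                    return 1;
            printf("features %#llx, shstk base %#llx, size %#llx\n",
                   buf[0], buf[1], buf[2]);
            return 0;
    }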

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
v24:
- Update #ifdef placement relating to shadow stack and ibt split.
- Update function names.

 arch/x86/include/asm/cet.h|  7 
 arch/x86/include/uapi/asm/prctl.h |  4 +++
 arch/x86/kernel/Makefile  |  1 +
 arch/x86/kernel/cet_prctl.c   | 60 +++
 arch/x86/kernel/process.c |  6 ++--
 5 files changed, 75 insertions(+), 3 deletions(-)
 create mode 100644 arch/x86/kernel/cet_prctl.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 5e66919bd2fe..26124820d46f 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -14,6 +14,7 @@ struct sc_ext;
 struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
+   unsigned intlocked:1;
 };
 
 #ifdef CONFIG_X86_SHADOW_STACK
@@ -40,6 +41,12 @@ static inline int shstk_check_rstor_token(bool ia32, 
unsigned long token_addr,
  unsigned long *new_ssp) { return 0; }
 #endif
 
+#ifdef CONFIG_X86_CET
+int prctl_cet(int option, u64 arg2);
+#else
+static inline int prctl_cet(int option, u64 arg2) { return -EINVAL; }
+#endif
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..9245bf629120 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,8 @@
 #define ARCH_MAP_VDSO_32   0x2002
 #define ARCH_MAP_VDSO_64   0x2003
 
+#define ARCH_X86_CET_STATUS0x3001
+#define ARCH_X86_CET_DISABLE   0x3002
+#define ARCH_X86_CET_LOCK  0x3003
+
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f99b093f350..868cb3aac618 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -151,6 +151,7 @@ obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
 obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o
+obj-$(CONFIG_X86_CET)  += cet_prctl.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index ..5f0054177d2a
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
+{
+   u64 buf[3] = {};
+
+   if (cet->shstk_size) {
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+   buf[1] = cet->shstk_base;
+   buf[2] = cet->shstk_size;
+   }
+
+   return copy_to_user(ubuf, buf, sizeof(buf));
+}
+
+int prctl_cet(int option, u64 arg2)
+{
+   struct cet_status *cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_CET))
+   return -ENOTSUPP;
+
+   cet = &current->thread.cet;
+
+   if (option == ARCH_X86_CET_STATUS)
+   return cet_copy_status_to_user(cet, (u64 __user *)arg2);
+
+   switch (option) {
+   case ARCH_X86_CET_DISABLE:
+   if (cet->locked)
+   return -EPERM;
+
+   if (arg2 & ~GNU_PROPERTY_X86_FEATURE_1_VALID)
+   return -EINVAL;
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   shstk_disable();
+   return 0;
+
+   case ARCH_X86_CET_LOCK:
+   if (arg2)
+   return -EINVAL;
+   cet->locked = 1;
+   return 0;
+
+   default:
+   return -ENOSYS;
+   }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index fa01e8679d01..315668a334fd 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -980,14 +980,14 @@ unsigned long get_wchan(struct task_struct *p)
 }
 
 long do_arch_prctl_common(struct task_struct *task, int option,
- unsigned long cpuid_enabled)
+			  unsigned long arg2)

[PATCH v24 26/30] ELF: Introduce arch_setup_elf_property()

2021-04-01 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates arch features supported by the
file.  These features are extracted by arch_parse_elf_property() and stored
in 'arch_elf_state'.

Introduce x86 feature definitions and arch_setup_elf_property(), which
enables such features.  The first use-case of this function is Shadow
Stack.

ARM64 is the other arch that has ARCH_USE_GNU_PROPERTY and
arch_parse_elf_property().  Add arch_setup_elf_property() for it.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Mark Brown 
Cc: Catalin Marinas 
Cc: Dave Martin 
---
v24:
- Change cet_setup_shstk() to shstk_setup() to reflect function name changes
  relating to the splitting of shadow stack and ibt.

 arch/arm64/include/asm/elf.h |  5 +
 arch/x86/Kconfig |  2 ++
 arch/x86/include/asm/elf.h   | 13 +
 arch/x86/kernel/process_64.c | 32 
 fs/binfmt_elf.c  |  4 
 include/linux/elf.h  |  6 ++
 include/uapi/linux/elf.h |  9 +
 7 files changed, 71 insertions(+)

diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 8d1c8dcb87fd..d37bc7915935 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -281,6 +281,11 @@ static inline int arch_parse_elf_property(u32 type, const void *data,
return 0;
 }
 
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
+
 static inline int arch_elf_pt_proc(void *ehdr, void *phdr,
   struct file *f, bool is_interp,
   struct arch_elf_state *state)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f42560b220ef..a69e351e7386 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1955,6 +1955,8 @@ config X86_SHADOW_STACK
depends on ARCH_HAS_SHADOW_STACK
select ARCH_USES_HIGH_VMA_FLAGS
select X86_CET
+   select ARCH_USE_GNU_PROPERTY
+   select ARCH_BINFMT_ELF_STATE
help
  Shadow Stack protection is a hardware feature that detects function
  return address corruption.  This helps mitigate ROP attacks.
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..6a131047be8a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -390,6 +390,19 @@ extern int compat_arch_setup_additional_pages(struct 
linux_binprm *bprm,
 
 extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs);
 
+#ifdef CONFIG_ARCH_BINFMT_ELF_STATE
+struct arch_elf_state {
+   unsigned int gnu_property;
+};
+
+#define INIT_ARCH_ELF_STATE {  \
+   .gnu_property = 0,  \
+}
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0)
+#endif
+
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
ALIGN_VA_32 = BIT(0),
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d08307df69ad..892d8e742e3b 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -835,3 +835,35 @@ unsigned long KSTK_ESP(struct task_struct *task)
 {
return task_pt_regs(task)->sp;
 }
+
+#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
+int arch_parse_elf_property(u32 type, const void *data, size_t datasz,
+   bool compat, struct arch_elf_state *state)
+{
+   if (type != GNU_PROPERTY_X86_FEATURE_1_AND)
+   return 0;
+
+   if (datasz != sizeof(unsigned int))
+   return -ENOEXEC;
+
+   state->gnu_property = *(unsigned int *)data;
+   return 0;
+}
+
+int arch_setup_elf_property(struct arch_elf_state *state)
+{
+   int r = 0;
+
+   if (!IS_ENABLED(CONFIG_X86_CET))
+   return r;
+
+   memset(&current->thread.cet, 0, sizeof(struct cet_status));
+
+   if (static_cpu_has(X86_FEATURE_SHSTK)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   r = shstk_setup();
+   }
+
+   return r;
+}
+#endif
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..fa665eceba04 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1248,6 +1248,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
set_binfmt(&elf_format);
 
+   retval = arch_setup_elf_property(&arch_state);
+   if (retval < 0)
+   goto out;
+
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
if (retval < 0)
diff --git a/include/linux/elf.h b/include/linux/elf.h
index c9a46c4e183b..be04d15e937f 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -92,9 +92,15 @@ static inline int arch_parse_elf_property(u32 type, const 
void *data,
 {
return 0;
 }
+
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
 #else
 extern int arch_parse_elf_prope

[PATCH v24 25/30] x86/cet/shstk: Handle signals for shadow stack

2021-04-01 Thread Yu-cheng Yu
When shadow stack is enabled, a task's shadow stack states must be saved
along with the signal context and later restored in sigreturn.  However,
currently there is no systematic facility for extending a signal context.

Introduce a signal context extension struct 'sc_ext', which is used to save
the shadow stack restore token address and WAIT_ENDBR status [1].  The
extension is located above the fpu states, plus alignment.

Introduce routines for the allocation, save, and restore for sc_ext:
- fpu__alloc_sigcontext_ext(),
- save_extra_state_to_sigframe(),
- get_extra_state_from_sigframe(),
- restore_extra_state().

[1] WAIT_ENDBR will be introduced later in the Indirect Branch Tracking
series, but add that into sc_ext now to keep the struct stable in case
the IBT series is applied later.
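
For illustration, a minimal sketch of where the extension would sit in the
signal frame (the names 'fpstate' and 'fpstate_size' are assumptions for
this sketch, not this patch's API):

	/* sc_ext is placed just above the fpu states, aligned to 8 bytes. */
	unsigned long ext_addr = ALIGN((unsigned long)fpstate + fpstate_size, 8);
	struct sc_ext __user *ext = (struct sc_ext __user *)ext_addr;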

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Split out shadow stack token routines to a separate patch.
- Put signal frame save/restore routines to fpu/signal.c and re-name 
accordingly.

 arch/x86/ia32/ia32_signal.c|  16 +++
 arch/x86/include/asm/cet.h |   2 +
 arch/x86/include/asm/fpu/internal.h|   2 +
 arch/x86/include/uapi/asm/sigcontext.h |   9 ++
 arch/x86/kernel/fpu/signal.c   | 143 +
 arch/x86/kernel/signal.c   |   9 ++
 6 files changed, 181 insertions(+)

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index 5e3d9b7fd5fb..96b87c5f0bbe 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -205,6 +205,7 @@ static void __user *get_sigframe(struct ksignal *ksig, 
struct pt_regs *regs,
 void __user **fpstate)
 {
unsigned long sp, fx_aligned, math_size;
+   void __user *restorer = NULL;
 
/* Default to using normal stack */
sp = regs->sp;
@@ -218,8 +219,23 @@ static void __user *get_sigframe(struct ksignal *ksig, 
struct pt_regs *regs,
 ksig->ka.sa.sa_restorer)
sp = (unsigned long) ksig->ka.sa.sa_restorer;
 
+   if (ksig->ka.sa.sa_flags & SA_RESTORER) {
+   restorer = ksig->ka.sa.sa_restorer;
+   } else if (current->mm->context.vdso) {
+   if (ksig->ka.sa.sa_flags & SA_SIGINFO)
+   restorer = current->mm->context.vdso +
+   vdso_image_32.sym___kernel_rt_sigreturn;
+   else
+   restorer = current->mm->context.vdso +
+   vdso_image_32.sym___kernel_sigreturn;
+   }
+
sp = fpu__alloc_mathframe(sp, 1, &fx_aligned, &math_size);
*fpstate = (struct _fpstate_32 __user *) sp;
+
+   if (save_extra_state_to_sigframe(1, *fpstate, (unsigned long)restorer))
+   return (void __user *)-1L;
+
if (copy_fpstate_to_sigframe(*fpstate, (void __user *)fx_aligned,
 math_size) < 0)
return (void __user *) -1L;
diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index ef6155213b7e..5e66919bd2fe 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 
 struct task_struct;
+struct sc_ext;
+
 /*
  * Per-thread CET status
  */
diff --git a/arch/x86/include/asm/fpu/internal.h 
b/arch/x86/include/asm/fpu/internal.h
index 8d33ad80704f..eb01eb6ea55d 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -443,6 +443,8 @@ static inline void copy_kernel_to_fpregs(union fpregs_state 
*fpstate)
__copy_kernel_to_fpregs(fpstate, -1);
 }
 
+extern int save_extra_state_to_sigframe(int ia32, void __user *fp,
+   unsigned long restorer);
 extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int 
size);
 
 /*
diff --git a/arch/x86/include/uapi/asm/sigcontext.h 
b/arch/x86/include/uapi/asm/sigcontext.h
index 844d60eb1882..cf2d55db3be4 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -196,6 +196,15 @@ struct _xstate {
/* New processor state extensions go here: */
 };
 
+/*
+ * Located at the end of sigcontext->fpstate, aligned to 8.
+ */
+struct sc_ext {
+   unsigned long total_size;
+   unsigned long ssp;
+   unsigned long wait_endbr;
+};
+
 /*
  * The 32-bit signal frame:
  */
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index a4ec65317a7f..2e56f2fe8be0 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -52,6 +52,123 @@ static inline int check_for_xstate(struct fxregs_state 
__user *buf,
return 0;
 }
 
+int save_extra_state_to_sigframe(int ia32, void __user *fp, unsigned long 
restorer)
+{
+   int err = 0;
+
+#ifdef CONFIG_X86_CET
+   struct cet_status *cet = &current->thread.cet;
+   unsigned long token_addr = 0, new_ssp = 0;
+ 

[PATCH v24 24/30] x86/cet/shstk: Introduce shadow stack token setup/verify routines

2021-04-01 Thread Yu-cheng Yu
A shadow stack restore token marks a restore point of the shadow stack, and
the address in a token must point directly above the token, within the same
shadow stack.  This is distinctly different from other pointers on the
shadow stack, since those pointers point into the executable code area.

The restore token can be used as extra protection for signal handling.  To
deliver a signal, create a shadow stack restore token and put the token and
the signal restorer address on the shadow stack.  In sigreturn, verify the
token and restore the shadow stack pointer from it.

Introduce token setup and verify routines.  Also introduce WRUSS, which is
a kernel-mode instruction that writes directly to the user shadow stack.
It is used to construct the user signal stack as described above.
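
As a rough sketch of the token layout described above (a simplified variant
for illustration, not the exact shstk_setup_rstor_token() body;
TOKEN_MODE_64 and write_user_shstk_64() appear in the diff below):

	/*
	 * Place a 64-bit restore token just below the current shadow stack
	 * pointer 'ssp'.  The token's value points directly above the token
	 * itself, with the low bit marking 64-bit mode.
	 */
	unsigned long token_addr = ssp - sizeof(u64);
	unsigned long token = (token_addr + sizeof(u64)) | TOKEN_MODE_64;

	return write_user_shstk_64(token_addr, token);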

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
 arch/x86/include/asm/cet.h   |   9 ++
 arch/x86/include/asm/special_insns.h |  32 +++
 arch/x86/kernel/shstk.c  | 126 +++
 3 files changed, 167 insertions(+)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 8b83ded577cc..ef6155213b7e 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -20,6 +20,10 @@ int shstk_setup_thread(struct task_struct *p, unsigned long 
clone_flags,
   unsigned long stack_size);
 void shstk_free(struct task_struct *p);
 void shstk_disable(void);
+int shstk_setup_rstor_token(bool ia32, unsigned long rstor,
+   unsigned long *token_addr, unsigned long *new_ssp);
+int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
+   unsigned long *new_ssp);
 #else
 static inline int shstk_setup(void) { return 0; }
 static inline int shstk_setup_thread(struct task_struct *p,
@@ -27,6 +31,11 @@ static inline int shstk_setup_thread(struct task_struct *p,
 unsigned long stack_size) { return 0; }
 static inline void shstk_free(struct task_struct *p) {}
 static inline void shstk_disable(void) {}
+static inline int shstk_setup_rstor_token(bool ia32, unsigned long rstor,
+ unsigned long *token_addr,
+ unsigned long *new_ssp) { return 0; }
+static inline int shstk_check_rstor_token(bool ia32, unsigned long token_addr,
+ unsigned long *new_ssp) { return 0; }
 #endif
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/special_insns.h 
b/arch/x86/include/asm/special_insns.h
index 1d3cbaef4bb7..c41c371f6c7d 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -234,6 +234,38 @@ static inline void clwb(volatile void *__p)
: [pax] "a" (p));
 }
 
+#ifdef CONFIG_X86_SHADOW_STACK
+#if defined(CONFIG_IA32_EMULATION) || defined(CONFIG_X86_X32)
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+   asm_volatile_goto("1: wrussd %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+   return 0;
+fail:
+   return -EPERM;
+}
+#else
+static inline int write_user_shstk_32(unsigned long addr, unsigned int val)
+{
+   WARN_ONCE(1, "%s used but not supported.\n", __func__);
+   return -EFAULT;
+}
+#endif
+
+static inline int write_user_shstk_64(unsigned long addr, unsigned long val)
+{
+   asm_volatile_goto("1: wrussq %1, (%0)\n"
+ _ASM_EXTABLE(1b, %l[fail])
+ :: "r" (addr), "r" (val)
+ :: fail);
+   return 0;
+fail:
+   return -EPERM;
+}
+#endif /* CONFIG_X86_SHADOW_STACK */
+
 #define nop() asm volatile ("nop")
 
 static inline void serialize(void)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 9c80785535b9..6fa98b228ee3 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -20,6 +20,7 @@
 #include 
 #include 
 #include 
+#include <asm/special_insns.h>
 
 static void start_update_msrs(void)
 {
@@ -181,3 +182,128 @@ void shstk_disable(void)
 
shstk_free(current);
 }
+
+static unsigned long _get_user_shstk_addr(void)
+{
+   struct fpu *fpu = &current->thread.fpu;
+   unsigned long ssp = 0;
+
+   fpregs_lock();
+
+   if (fpregs_state_valid(fpu, smp_processor_id())) {
+   rdmsrl(MSR_IA32_PL3_SSP, ssp);
+   } else {
+   struct cet_user_state *p;
+
+   p = get_xsave_addr(&fpu->state.xsave, XFEATURE_CET_USER);
+   if (p)
+   ssp = p->user_ssp;
+   }
+
+   fpregs_unlock();
+   return ssp;
+}
+
+#define TOKEN_MODE_MASK	3UL
+#define TOKEN_MODE_64  1UL
+#define IS_TOKEN_64(token) (((token) & TOKEN_MODE_MASK) == TOKEN_MODE_64)
+#define IS_TOKEN_32(token) (((token) & TOKEN_MODE_MASK) == 0)

[PATCH v24 23/30] x86/cet/shstk: Handle thread shadow stack

2021-04-01 Thread Yu-cheng Yu
The kernel allocates (and frees on thread exit) a new shadow stack for a
pthread child.

The kernel could instead complete the clone syscall with the child's shadow
stack pointer set to NULL and let the child thread allocate a shadow stack
for itself.  There are two issues with that approach: it is not compatible
with existing code that makes the syscall inline, and it cannot handle
signals delivered before the child has successfully allocated a shadow
stack.

Use stack_size passed from clone3() syscall for thread shadow stack size,
but cap it to min(RLIMIT_STACK, 4 GB).  A compat-mode thread shadow stack
size is further reduced to 1/4.  This allows more threads to run in a 32-
bit address space.
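
A minimal sketch of that sizing policy (variable names follow this series;
the in_ia32_syscall() compat check is an assumption for illustration):

	unsigned long size;

	/* Cap the clone3() stack_size to min(RLIMIT_STACK, 4 GB). */
	size = min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G);
	size = min(size, stack_size);
	if (in_ia32_syscall())
		size /= 4;	/* compat-mode thread: quarter the size */
	size = round_up(size, PAGE_SIZE);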

Signed-off-by: Yu-cheng Yu 
---
 arch/x86/include/asm/cet.h |  5 +++
 arch/x86/include/asm/mmu_context.h |  3 ++
 arch/x86/kernel/process.c  | 15 ++--
 arch/x86/kernel/shstk.c| 57 +-
 4 files changed, 76 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index aa85d599b184..8b83ded577cc 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -16,10 +16,15 @@ struct cet_status {
 
 #ifdef CONFIG_X86_SHADOW_STACK
 int shstk_setup(void);
+int shstk_setup_thread(struct task_struct *p, unsigned long clone_flags,
+  unsigned long stack_size);
 void shstk_free(struct task_struct *p);
 void shstk_disable(void);
 #else
 static inline int shstk_setup(void) { return 0; }
+static inline int shstk_setup_thread(struct task_struct *p,
+unsigned long clone_flags,
+unsigned long stack_size) { return 0; }
 static inline void shstk_free(struct task_struct *p) {}
 static inline void shstk_disable(void) {}
 #endif
diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h
index 27516046117a..53569114aa01 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include <asm/cet.h>
 #include 
 
 extern atomic64_t last_mm_ctx_id;
@@ -146,6 +147,8 @@ do {\
 #else
 #define deactivate_mm(tsk, mm) \
 do {   \
+   if (!tsk->vfork_done)   \
+   shstk_free(tsk);\
load_gs_index(0);   \
loadsegment(fs, 0); \
 } while (0)
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c214d7085a4..fa01e8679d01 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include <asm/cet.h>
 
 #include "process.h"
 
@@ -109,6 +110,7 @@ void exit_thread(struct task_struct *tsk)
 
free_vm86(t);
 
+   shstk_free(tsk);
fpu__drop(fpu);
 }
 
@@ -122,8 +124,9 @@ static int set_new_tls(struct task_struct *p, unsigned long 
tls)
return do_set_thread_area_64(p, ARCH_SET_FS, tls);
 }
 
-int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg,
-   struct task_struct *p, unsigned long tls)
+int copy_thread(unsigned long clone_flags, unsigned long sp,
+   unsigned long stack_size, struct task_struct *p,
+   unsigned long tls)
 {
struct inactive_task_frame *frame;
struct fork_frame *fork_frame;
@@ -163,7 +166,7 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp, unsigned long arg,
/* Kernel thread ? */
if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
memset(childregs, 0, sizeof(struct pt_regs));
-   kthread_frame_init(frame, sp, arg);
+   kthread_frame_init(frame, sp, stack_size);
return 0;
}
 
@@ -181,6 +184,12 @@ int copy_thread(unsigned long clone_flags, unsigned long 
sp, unsigned long arg,
if (clone_flags & CLONE_SETTLS)
ret = set_new_tls(p, tls);
 
+#ifdef CONFIG_X86_64
+   /* Allocate a new shadow stack for pthread */
+   if (!ret)
+   ret = shstk_setup_thread(p, clone_flags, stack_size);
+#endif
+
if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
io_bitmap_share(p);
 
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
index 5406fdf6df3c..9c80785535b9 100644
--- a/arch/x86/kernel/shstk.c
+++ b/arch/x86/kernel/shstk.c
@@ -75,6 +75,55 @@ int shstk_setup(void)
return 0;
 }
 
+int shstk_setup_thread(struct task_struct *tsk, unsigned long clone_flags,
+  unsigned long stack_size)
+{
+   unsigned long addr, size;
+   struct cet_user_state *state;
+   struct cet_status *cet = &tsk->thread.cet;
+
+   if (!cet->shstk_size)
+   return 0;
+
+   if ((clon

[PATCH v24 22/30] x86/cet/shstk: Add user-mode shadow stack support

2021-04-01 Thread Yu-cheng Yu
Introduce basic shadow stack enabling/disabling/allocation routines.
A task's shadow stack is allocated from memory with the VM_SHADOW_STACK
flag and has a fixed size of min(RLIMIT_STACK, 4 GB).

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Rename cet.c to shstk.c, update related areas accordingly.

 arch/x86/include/asm/cet.h   |  29 +++
 arch/x86/include/asm/processor.h |   5 ++
 arch/x86/kernel/Makefile |   2 +
 arch/x86/kernel/shstk.c  | 128 +++
 4 files changed, 164 insertions(+)
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/shstk.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index ..aa85d599b184
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,29 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include <linux/types.h>
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_status {
+   unsigned long   shstk_base;
+   unsigned long   shstk_size;
+};
+
+#ifdef CONFIG_X86_SHADOW_STACK
+int shstk_setup(void);
+void shstk_free(struct task_struct *p);
+void shstk_disable(void);
+#else
+static inline int shstk_setup(void) { return 0; }
+static inline void shstk_free(struct task_struct *p) {}
+static inline void shstk_disable(void) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f1b9ed5efaa9..a5d703fda74e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct vm86;
 #include 
 #include 
 #include 
+#include <asm/cet.h>
 
 #include 
 #include 
@@ -535,6 +536,10 @@ struct thread_struct {
 
 	unsigned int		sig_on_uaccess_err:1;
 
+#ifdef CONFIG_X86_CET
+   struct cet_status   cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu  fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..0f99b093f350 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,6 +150,8 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER)+= 
unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
+obj-$(CONFIG_X86_SHADOW_STACK) += shstk.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c
new file mode 100644
index ..5406fdf6df3c
--- /dev/null
+++ b/arch/x86/kernel/shstk.c
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * shstk.c - Intel shadow stack support
+ *
+ * Copyright (c) 2021, Intel Corporation.
+ * Yu-cheng Yu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void start_update_msrs(void)
+{
+   fpregs_lock();
+   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   __fpregs_load_activate();
+}
+
+static void end_update_msrs(void)
+{
+   fpregs_unlock();
+}
+
+static unsigned long alloc_shstk(unsigned long size, int flags)
+{
+   struct mm_struct *mm = current->mm;
+   unsigned long addr, populate;
+
+   /* VM_SHADOW_STACK requires MAP_ANONYMOUS, MAP_PRIVATE */
+   flags |= MAP_ANONYMOUS | MAP_PRIVATE;
+
+   mmap_write_lock(mm);
+   addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHADOW_STACK, 0,
+  &populate, NULL);
+   mmap_write_unlock(mm);
+
+   if (populate)
+   mm_populate(addr, populate);
+
+   return addr;
+}
+
+int shstk_setup(void)
+{
+   unsigned long addr, size;
+   struct cet_status *cet = &current->thread.cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return -EOPNOTSUPP;
+
+   size = round_up(min_t(unsigned long long, rlimit(RLIMIT_STACK), SZ_4G), 
PAGE_SIZE);
+   addr = alloc_shstk(size, 0);
+   if (IS_ERR_VALUE(addr))
+   return PTR_ERR((void *)addr);
+
+   cet->shstk_base = addr;
+   cet->shstk_size = size;
+
+   start_update_msrs();
+   wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+   wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+   end_update_msrs();
+   return 0;
+}
+
+void shstk_free(struct task_struct *tsk)
+{
+   struct cet_status *cet = &tsk->thread.cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK) ||
+   !cet->shstk_size ||
+   !cet->shstk_base)
+   return;
+
+   if (!tsk->mm)
+   return;
+
+   while (1) {
+   int r;
+
+   r = vm_munmap(cet->shstk_base, cet->shstk_size);
+
+   /*
+* vm_munmap() returns -EINTR when mmap_lock is held by
+* something else, and that lock should not be held for a
+* long time.  Retry.
+*/

[PATCH v24 21/30] mm: Re-introduce vm_flags to do_mmap()

2021-04-01 Thread Yu-cheng Yu
There were no more callers passing vm_flags to do_mmap(), so vm_flags was
removed from the function's parameters by:

commit 45e55300f114 ("mm: remove unnecessary wrapper function do_mmap_pgoff()").

There is a new user now.  Shadow stack allocation passes VM_SHADOW_STACK to
do_mmap().  Thus, re-introduce vm_flags to do_mmap().
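
For example, the new shadow stack caller (alloc_shstk() earlier in this
series) passes VM_SHADOW_STACK straight through:

	addr = do_mmap(NULL, 0, size, PROT_READ,
		       MAP_ANONYMOUS | MAP_PRIVATE, VM_SHADOW_STACK, 0,
		       &populate, NULL);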

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Peter Collingbourne 
Reviewed-by: Kees Cook 
Cc: Andrew Morton 
Cc: Oleg Nesterov 
Cc: linux...@kvack.org
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.
- Update commit log.

 fs/aio.c   |  2 +-
 include/linux/mm.h |  3 ++-
 ipc/shm.c  |  2 +-
 mm/mmap.c  | 10 +-
 mm/nommu.c |  4 ++--
 mm/util.c  |  2 +-
 6 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..b5d0586209a7 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -529,7 +529,7 @@ static int aio_setup_ring(struct kioctx *ctx, unsigned int 
nr_events)
 
ctx->mmap_base = do_mmap(ctx->aio_ring_file, 0, ctx->mmap_size,
 PROT_READ | PROT_WRITE,
-MAP_SHARED, 0, &unused, NULL);
+MAP_SHARED, 0, 0, &unused, NULL);
mmap_write_unlock(mm);
if (IS_ERR((void *)ctx->mmap_base)) {
ctx->mmap_size = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 3e9c84f21ef6..1ccec5cc399b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2576,7 +2576,8 @@ extern unsigned long mmap_region(struct file *file, 
unsigned long addr,
struct list_head *uf);
 extern unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long flags,
-   unsigned long pgoff, unsigned long *populate, struct list_head *uf);
+   vm_flags_t vm_flags, unsigned long pgoff, unsigned long *populate,
+   struct list_head *uf);
 extern int __do_munmap(struct mm_struct *, unsigned long, size_t,
   struct list_head *uf, bool downgrade);
 extern int do_munmap(struct mm_struct *, unsigned long, size_t,
diff --git a/ipc/shm.c b/ipc/shm.c
index febd88daba8c..b6370eb1eaab 100644
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -1556,7 +1556,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg,
goto invalid;
}
 
-   addr = do_mmap(file, addr, size, prot, flags, 0, &populate, NULL);
+   addr = do_mmap(file, addr, size, prot, flags, 0, 0, &populate, NULL);
*raddr = addr;
err = 0;
if (IS_ERR_VALUE(addr))
diff --git a/mm/mmap.c b/mm/mmap.c
index d77fb39b6ab5..7b2992ef8ee0 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1401,11 +1401,11 @@ static inline bool file_mmap_ok(struct file *file, 
struct inode *inode,
  */
 unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
-   unsigned long flags, unsigned long pgoff,
-   unsigned long *populate, struct list_head *uf)
+   unsigned long flags, vm_flags_t vm_flags,
+   unsigned long pgoff, unsigned long *populate,
+   struct list_head *uf)
 {
struct mm_struct *mm = current->mm;
-   vm_flags_t vm_flags;
int pkey = 0;
 
*populate = 0;
@@ -1467,7 +1467,7 @@ unsigned long do_mmap(struct file *file, unsigned long 
addr,
 * to. we assume access permissions have been handled by the open
 * of the memory object, so we don't do any here.
 */
-   vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+   vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
 
if (flags & MAP_LOCKED)
@@ -3047,7 +3047,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, 
unsigned long, size,
 
file = get_file(vma->vm_file);
ret = do_mmap(vma->vm_file, start, size,
-   prot, flags, pgoff, &populate, NULL);
+   prot, flags, 0, pgoff, &populate, NULL);
fput(file);
 out:
mmap_write_unlock(mm);
diff --git a/mm/nommu.c b/mm/nommu.c
index 5c9ab799c0e6..9b6f7a1895c2 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1071,6 +1071,7 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
+   vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1078,7 +1079,6 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
-   vm_flags_t vm_flags;

[PATCH v24 11/30] x86/mm: Update pte_modify for _PAGE_COW

2021-04-01 Thread Yu-cheng Yu
A read-only and Dirty PTE has been used to indicate copy-on-write pages.
However, newer x86 processors also regard a read-only and Dirty PTE as a
shadow stack page.  In order to separate the two, the software-defined
_PAGE_COW is created to replace _PAGE_DIRTY for the copy-on-write case, and
the pte_*() helpers are updated.

pte_modify() changes a PTE to 'newprot', but it doesn't use the pte_*()
helpers.  Introduce fixup_dirty_pte(), which, based on _PAGE_RW, sets a
dirty PTE's dirty bit to either _PAGE_DIRTY or _PAGE_COW.

Apply the same changes to pmd_modify().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 37 ++
 1 file changed, 37 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 9c056d5815de..e1739f590ca6 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -799,6 +799,23 @@ static inline pmd_t pmd_mkinvalid(pmd_t pmd)
 
 static inline u64 flip_protnone_guard(u64 oldval, u64 val, u64 mask);
 
+static inline pteval_t fixup_dirty_pte(pteval_t pteval)
+{
+   pte_t pte = __pte(pteval);
+
+   /*
+* Fix up potential shadow stack page flags because the RO, Dirty
+* PTE is special.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   if (pte_dirty(pte)) {
+   pte = pte_mkclean(pte);
+   pte = pte_mkdirty(pte);
+   }
+   }
+   return pte_val(pte);
+}
+
 static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
 {
pteval_t val = pte_val(pte), oldval = val;
@@ -809,16 +826,36 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t 
newprot)
 */
val &= _PAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_PAGE_CHG_MASK;
+   val = fixup_dirty_pte(val);
val = flip_protnone_guard(oldval, val, PTE_PFN_MASK);
return __pte(val);
 }
 
+static inline int pmd_write(pmd_t pmd);
+static inline pmdval_t fixup_dirty_pmd(pmdval_t pmdval)
+{
+   pmd_t pmd = __pmd(pmdval);
+
+   /*
+* Fix up potential shadow stack page flags because the RO, Dirty
+* PMD is special.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   if (pmd_dirty(pmd)) {
+   pmd = pmd_mkclean(pmd);
+   pmd = pmd_mkdirty(pmd);
+   }
+   }
+   return pmd_val(pmd);
+}
+
 static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 {
pmdval_t val = pmd_val(pmd), oldval = val;
 
val &= _HPAGE_CHG_MASK;
val |= check_pgprot(newprot) & ~_HPAGE_CHG_MASK;
+   val = fixup_dirty_pmd(val);
val = flip_protnone_guard(oldval, val, PHYSICAL_PMD_PAGE_MASK);
return __pmd(val);
 }
-- 
2.21.0



[PATCH v24 16/30] mm: Fixup places that call pte_mkwrite() directly

2021-04-01 Thread Yu-cheng Yu
When serving a page fault, maybe_mkwrite() makes a PTE writable if it is in
a writable vma.  A shadow stack vma is writable, but its PTEs need
_PAGE_DIRTY to be set to become writable.  For this reason, maybe_mkwrite()
has been updated.

There are a few places that call pte_mkwrite() directly but expect the same
result as from maybe_mkwrite().  These sites need to be updated for shadow
stack as well.  Thus, change them to maybe_mkwrite():

- do_anonymous_page() and migrate_vma_insert_page() check VM_WRITE directly
  and call pte_mkwrite(), which is the same as maybe_mkwrite().  Change
  them to maybe_mkwrite().

- In do_numa_page(), if the numa entry was writable, then pte_mkwrite()
  is called directly.  Fix it by doing maybe_mkwrite().

- In change_pte_range(), pte_mkwrite() is called directly.  Replace it with
  maybe_mkwrite().

  A shadow stack vma is writable but has different vma flags, which
  maybe_mkwrite() handles accordingly.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
 mm/memory.c   | 5 ++---
 mm/migrate.c  | 3 +--
 mm/mprotect.c | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 5efa07fb6cdc..c70c3847f79d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3561,8 +3561,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
 
entry = mk_pte(page, vma->vm_page_prot);
-   if (vma->vm_flags & VM_WRITE)
-   entry = pte_mkwrite(pte_mkdirty(entry));
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
@@ -4125,7 +4124,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
pte = pte_modify(old_pte, vma->vm_page_prot);
pte = pte_mkyoung(pte);
if (was_writable)
-   pte = pte_mkwrite(pte);
+   pte = maybe_mkwrite(pte, vma);
ptep_modify_prot_commit(vma, vmf->address, vmf->pte, old_pte, pte);
update_mmu_cache(vma, vmf->address, vmf->pte);
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 62b81d5257aa..7251c88a3d64 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2976,8 +2976,7 @@ static void migrate_vma_insert_page(struct migrate_vma 
*migrate,
}
} else {
entry = mk_pte(page, vma->vm_page_prot);
-   if (vma->vm_flags & VM_WRITE)
-   entry = pte_mkwrite(pte_mkdirty(entry));
+   entry = maybe_mkwrite(pte_mkdirty(entry), vma);
}
 
ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 94188df1ee55..c1ce78d688b6 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -135,7 +135,7 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
if (dirty_accountable && pte_dirty(ptent) &&
(pte_soft_dirty(ptent) ||
 !(vma->vm_flags & VM_SOFTDIRTY))) {
-   ptent = pte_mkwrite(ptent);
+   ptent = maybe_mkwrite(ptent, vma);
}
ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
pages++;
-- 
2.21.0



[PATCH v24 20/30] mm/mprotect: Exclude shadow stack from preserve_write

2021-04-01 Thread Yu-cheng Yu
In change_pte_range(), when a PTE is changed for prot_numa, _PAGE_RW is
preserved to avoid the additional write fault after the NUMA hinting fault.
However, pte_write() now covers both normal writable and shadow stack
(RW=0, Dirty=1) PTEs, and the latter does not have _PAGE_RW, so there is
nothing to preserve.

Exclude shadow stack from the preserve_write test, and apply the same
change to change_huge_pmd().

Signed-off-by: Yu-cheng Yu 
Cc: Kirill A. Shutemov 
---
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().

 mm/huge_memory.c | 7 ++-
 mm/mprotect.c| 9 -
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 65fc0aedd577..1d41138c4f74 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1812,12 +1812,17 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t 
*pmd,
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
+   bool shstk = is_shadow_stack_mapping(vma->vm_flags);
 
ptl = __pmd_trans_huge_lock(pmd, vma);
if (!ptl)
return 0;
 
-   preserve_write = prot_numa && pmd_write(*pmd);
+   /*
+* Preserve only normal writable huge PMD, but not shadow
+* stack (RW=0, Dirty=1).
+*/
+   preserve_write = prot_numa && pmd_write(*pmd) && !shstk;
ret = 1;
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c1ce78d688b6..550448dc5ff1 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,7 +75,14 @@ static unsigned long change_pte_range(struct vm_area_struct 
*vma, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
-   bool preserve_write = prot_numa && pte_write(oldpte);
+   bool shstk = is_shadow_stack_mapping(vma->vm_flags);
+   bool preserve_write;
+
+   /*
+* Preserve only normal writable PTE, but not shadow
+* stack (RW=0, Dirty=1).
+*/
+   preserve_write = prot_numa && pte_write(oldpte) && 
!shstk;
 
/*
 * Avoid trapping faults against the zero or KSM
-- 
2.21.0



[PATCH v24 18/30] mm/mmap: Add shadow stack pages to memory accounting

2021-04-01 Thread Yu-cheng Yu
Account shadow stack pages to stack memory.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/mm/pgtable.c   |  7 +++
 include/linux/pgtable.h | 11 +++
 mm/mmap.c   |  5 +
 3 files changed, 23 insertions(+)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e778dbbef3d8..212a8c1fe5ba 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -897,3 +897,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 
 #endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+bool is_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return (vm_flags & VM_SHADOW_STACK);
+}
+#endif
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index 5e772392a379..0070a6d5c272 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1446,6 +1446,17 @@ static inline bool arch_has_pfn_modify_check(void)
 }
 #endif /* !_HAVE_ARCH_PFN_MODIFY_ALLOWED */
 
+#ifdef CONFIG_MMU
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+bool is_shadow_stack_mapping(vm_flags_t vm_flags);
+#else
+static inline bool is_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return false;
+}
+#endif /* CONFIG_ARCH_HAS_SHADOW_STACK */
+#endif /* CONFIG_MMU */
+
 /*
  * Architecture PAGE_KERNEL_* fallbacks
  *
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..d77fb39b6ab5 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1718,6 +1718,9 @@ static inline int accountable_mapping(struct file *file, 
vm_flags_t vm_flags)
if (file && is_file_hugepages(file))
return 0;
 
+   if (is_shadow_stack_mapping(vm_flags))
+   return 1;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
@@ -3387,6 +3390,8 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t 
flags, long npages)
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
+   else if (is_shadow_stack_mapping(flags))
+   mm->stack_vm += npages;
 }
 
 static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
-- 
2.21.0



[PATCH v24 15/30] x86/mm: Update maybe_mkwrite() for shadow stack

2021-04-01 Thread Yu-cheng Yu
When serving a page fault, maybe_mkwrite() makes a PTE writable if its vma
has VM_WRITE.

A shadow stack vma has VM_SHADOW_STACK.  Its PTEs have _PAGE_DIRTY, but not
_PAGE_WRITE.  In fork(), _PAGE_DIRTY is cleared to cause copy-on-write,
and in the page fault handler, _PAGE_DIRTY is restored and the shadow stack
page is writable again.

Introduce an x86 version of maybe_mkwrite(), which sets proper PTE bits
according to VM flags.

Apply the same changes to maybe_pmd_mkwrite().

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Instead of doing arch_maybe_mkwrite(), overwrite maybe*_mkwrite() with x86
  versions.
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/include/asm/pgtable.h |  8 
 arch/x86/mm/pgtable.c  | 20 
 include/linux/mm.h |  2 ++
 mm/huge_memory.c   |  2 ++
 4 files changed, 32 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 46d9394b884f..51cdf14488b7 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1686,6 +1686,14 @@ static inline bool arch_faults_on_old_pte(void)
return false;
 }
 
+#define maybe_mkwrite maybe_mkwrite
+extern pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma);
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define maybe_pmd_mkwrite maybe_pmd_mkwrite
+extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_X86_PGTABLE_H */
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index f6a9e2e36642..e778dbbef3d8 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -610,6 +610,26 @@ int pmdp_clear_flush_young(struct vm_area_struct *vma,
 }
 #endif
 
+pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+{
+   if (likely(vma->vm_flags & VM_WRITE))
+   pte = pte_mkwrite(pte);
+   else if (likely(vma->vm_flags & VM_SHADOW_STACK))
+   pte = pte_mkwrite_shstk(pte);
+   return pte;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
+{
+   if (likely(vma->vm_flags & VM_WRITE))
+   pmd = pmd_mkwrite(pmd);
+   else if (likely(vma->vm_flags & VM_SHADOW_STACK))
+   pmd = pmd_mkwrite_shstk(pmd);
+   return pmd;
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
 /**
  * reserve_top_address - reserves a hole in the top of kernel address space
  * @reserve - size of hole to reserve
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 08282eb2f195..6ac9b3e9a865 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -993,12 +993,14 @@ void free_compound_page(struct page *page);
  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
  * that do not have writing enabled, when used by access_process_vm.
  */
+#ifndef maybe_mkwrite
 static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 {
if (likely(vma->vm_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
 }
+#endif
 
 vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page);
 void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ae907a9c2050..8203bd6ae4bd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -478,12 +478,14 @@ static int __init setup_transparent_hugepage(char *str)
 }
 __setup("transparent_hugepage=", setup_transparent_hugepage);
 
+#ifndef maybe_pmd_mkwrite
 pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
 {
if (likely(vma->vm_flags & VM_WRITE))
pmd = pmd_mkwrite(pmd);
return pmd;
 }
+#endif
 
 #ifdef CONFIG_MEMCG
 static inline struct deferred_split *get_deferred_split_queue(struct page 
*page)
-- 
2.21.0



[PATCH v24 19/30] mm: Update can_follow_write_pte() for shadow stack

2021-04-01 Thread Yu-cheng Yu
can_follow_write_pte() ensures a read-only page has been COWed by checking
the FOLL_COW flag, and uses pte_dirty() to validate that the flag is still
applicable.

Like a writable data page, a shadow stack page is writable, and becomes
read-only during copy-on-write, but it is always dirty.  Thus, in the
can_follow_write_pte() check, it belongs to the writable page case and
should be excluded from the read-only page pte_dirty() check.  Apply
the same changes to can_follow_write_pmd().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Change arch_shadow_stack_mapping() to is_shadow_stack_mapping().

 mm/gup.c | 8 +---
 mm/huge_memory.c | 8 +---
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/mm/gup.c b/mm/gup.c
index e40579624f10..c313cc988865 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -356,10 +356,12 @@ static int follow_pfn_pte(struct vm_area_struct *vma, 
unsigned long address,
  * FOLL_FORCE can write to even unwritable pte's, but only
  * after we've gone through a COW cycle and they are dirty.
  */
-static inline bool can_follow_write_pte(pte_t pte, unsigned int flags)
+static inline bool can_follow_write_pte(pte_t pte, unsigned int flags,
+   struct vm_area_struct *vma)
 {
return pte_write(pte) ||
-   ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte));
+   ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte) &&
+ !is_shadow_stack_mapping(vma->vm_flags));
 }
 
 static struct page *follow_page_pte(struct vm_area_struct *vma,
@@ -402,7 +404,7 @@ static struct page *follow_page_pte(struct vm_area_struct 
*vma,
}
if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
-   if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) {
+   if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags, vma)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 8203bd6ae4bd..65fc0aedd577 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1338,10 +1338,12 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, 
pmd_t orig_pmd)
  * FOLL_FORCE can write to even unwritable pmd's, but only
  * after we've gone through a COW cycle and they are dirty.
  */
-static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags)
+static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags,
+   struct vm_area_struct *vma)
 {
return pmd_write(pmd) ||
-  ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd));
+  ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd) &&
+ !is_shadow_stack_mapping(vma->vm_flags));
 }
 
 struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
@@ -1354,7 +1356,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct 
*vma,
 
assert_spin_locked(pmd_lockptr(mm, pmd));
 
-   if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags))
+   if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags, vma))
goto out;
 
/* Avoid dumping huge zero page */
-- 
2.21.0



[PATCH v24 17/30] mm: Add guard pages around a shadow stack.

2021-04-01 Thread Yu-cheng Yu
INCSSP(Q/D) increments the shadow stack pointer and 'pops and discards' the
first and the last elements in the range, effectively touching those memory
areas.

The maximum distance INCSSPQ can move the pointer is 255 * 8 = 2040 bytes,
and 255 * 4 = 1020 bytes for INCSSPD.  Both are far smaller than PAGE_SIZE.
Thus, putting a gap page on both ends of a shadow stack prevents INCSSP,
CALL, and RET from going beyond it.
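
As a sanity check of that arithmetic (a sketch, not part of the patch;
assumes 4-KB pages):

	/* The largest single INCSSP move, 255 * 8 = 2040 bytes, is smaller
	 * than the one-page guard gap, so the gap cannot be stepped over
	 * without touching it and faulting. */
	BUILD_BUG_ON(255 * 8 >= PAGE_SIZE);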

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Instead changing vm_*_gap(), create x86-specific versions.

 arch/x86/include/asm/page_types.h | 17 +++
 arch/x86/mm/mmap.c| 36 +++
 include/linux/mm.h|  4 
 3 files changed, 57 insertions(+)

diff --git a/arch/x86/include/asm/page_types.h 
b/arch/x86/include/asm/page_types.h
index a506a411474d..3a5529bcfd76 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -73,6 +73,23 @@ bool pfn_range_is_mapped(unsigned long start_pfn, unsigned 
long end_pfn);
 
 extern void initmem_init(void);
 
+/*
+ * Shadow stack pointer is moved by CALL, RET, and INCSSP(Q/D).  INCSSPQ
+ * moves shadow stack pointer up to 255 * 8 = ~2 KB (~1KB for INCSSPD) and
+ * touches the first and the last element in the range, which triggers a
+ * page fault if the range is not in a shadow stack.  Because of this,
+ * creating 4-KB guard pages around a shadow stack prevents these
+ * instructions from going beyond.
+ */
+#define SHADOW_STACK_GUARD_GAP PAGE_SIZE
+
+#define vm_start_gap vm_start_gap
+struct vm_area_struct;
+extern unsigned long vm_start_gap(struct vm_area_struct *vma);
+
+#define vm_end_gap vm_end_gap
+extern unsigned long vm_end_gap(struct vm_area_struct *vma);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PAGE_DEFS_H */
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index f3f52c5e2fd6..e714ddc8f3f7 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -250,3 +250,39 @@ bool pfn_modify_allowed(unsigned long pfn, pgprot_t prot)
return false;
return true;
 }
+
+unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+   unsigned long vm_start = vma->vm_start;
+   unsigned long gap = 0;
+
+   if (vma->vm_flags & VM_GROWSDOWN)
+   gap = stack_guard_gap;
+   else if (vma->vm_flags & VM_SHADOW_STACK)
+   gap = SHADOW_STACK_GUARD_GAP;
+
+   if (gap != 0) {
+   vm_start -= gap;
+   if (vm_start > vma->vm_start)
+   vm_start = 0;
+   }
+   return vm_start;
+}
+
+unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+   unsigned long vm_end = vma->vm_end;
+   unsigned long gap = 0;
+
+   if (vma->vm_flags & VM_GROWSUP)
+   gap = stack_guard_gap;
+   else if (vma->vm_flags & VM_SHADOW_STACK)
+   gap = SHADOW_STACK_GUARD_GAP;
+
+   if (gap != 0) {
+   vm_end += gap;
+   if (vm_end < vma->vm_end)
+   vm_end = -PAGE_SIZE;
+   }
+   return vm_end;
+}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 6ac9b3e9a865..3e9c84f21ef6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2660,6 +2660,7 @@ static inline struct vm_area_struct * 
find_vma_intersection(struct mm_struct * m
return vma;
 }
 
+#ifndef vm_start_gap
 static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
 {
unsigned long vm_start = vma->vm_start;
@@ -2671,7 +2672,9 @@ static inline unsigned long vm_start_gap(struct 
vm_area_struct *vma)
}
return vm_start;
 }
+#endif
 
+#ifndef vm_end_gap
 static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
 {
unsigned long vm_end = vma->vm_end;
@@ -2683,6 +2686,7 @@ static inline unsigned long vm_end_gap(struct 
vm_area_struct *vma)
}
return vm_end;
 }
+#endif
 
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {
-- 
2.21.0



[PATCH v24 12/30] x86/mm: Update ptep_set_wrprotect() and pmdp_set_wrprotect() for transition from _PAGE_DIRTY to _PAGE_COW

2021-04-01 Thread Yu-cheng Yu
When Shadow Stack is introduced, [R/O + _PAGE_DIRTY] PTE is reserved for
shadow stack.  Copy-on-write PTEs have [R/O + _PAGE_COW].

When a PTE goes from [R/W + _PAGE_DIRTY] to [R/O + _PAGE_COW], it could
become a transient shadow stack PTE in two cases:

The first case is that some processors can start a write but end up seeing
a read-only PTE by the time they get to the Dirty bit, creating a transient
shadow stack PTE.  However, this will not occur on processors supporting
Shadow Stack, and a TLB flush is not necessary.

The second case is that when _PAGE_DIRTY is replaced with _PAGE_COW non-
atomically, a transient shadow stack PTE can be created as a result.
Thus, prevent that with cmpxchg.

Dave Hansen, Jann Horn, Andy Lutomirski, and Peter Zijlstra provided many
insights to the issue.  Jann Horn provided the cmpxchg solution.
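
To make the race concrete, an illustrative interleaving (not from the
patch):

	/*
	 * CPU A: ptep_set_wrprotect()      CPU B: hardware write
	 * ---------------------------      ---------------------
	 * reads PTE: RW=1, Dirty=0
	 *                                  write sets Dirty=1
	 * clears RW: RW=0, Dirty=1         <- transient shadow stack PTE
	 *
	 * With try_cmpxchg(), CPU A instead notices that the PTE changed
	 * underneath it and retries with the updated value.
	 */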

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 36 ++
 1 file changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index e1739f590ca6..46d9394b884f 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1306,6 +1306,24 @@ static inline pte_t ptep_get_and_clear_full(struct 
mm_struct *mm,
 static inline void ptep_set_wrprotect(struct mm_struct *mm,
  unsigned long addr, pte_t *ptep)
 {
+   /*
+* If Shadow Stack is enabled, pte_wrprotect() moves _PAGE_DIRTY
+* to _PAGE_COW (see comments at pte_wrprotect()).
+* When a thread reads a RW=1, Dirty=0 PTE and before changing it
+* to RW=0, Dirty=0, another thread could have written to the page
+* and the PTE is RW=1, Dirty=1 now.  Use try_cmpxchg() to detect
+* PTE changes and update old_pte, then try again.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   pte_t old_pte, new_pte;
+
+   old_pte = READ_ONCE(*ptep);
+   do {
+   new_pte = pte_wrprotect(old_pte);
+   } while (!try_cmpxchg(&ptep->pte, &old_pte.pte, new_pte.pte));
+
+   return;
+   }
clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte);
 }
 
@@ -1350,6 +1368,24 @@ static inline pud_t pudp_huge_get_and_clear(struct 
mm_struct *mm,
 static inline void pmdp_set_wrprotect(struct mm_struct *mm,
  unsigned long addr, pmd_t *pmdp)
 {
+   /*
+* If Shadow Stack is enabled, pmd_wrprotect() moves _PAGE_DIRTY
+* to _PAGE_COW (see comments at pmd_wrprotect()).
+* When a thread reads a RW=1, Dirty=0 PMD and before changing it
+* to RW=0, Dirty=0, another thread could have written to the page
+* and the PMD is RW=1, Dirty=1 now.  Use try_cmpxchg() to detect
+* PMD changes and update old_pmd, then try again.
+*/
+   if (cpu_feature_enabled(X86_FEATURE_SHSTK)) {
+   pmd_t old_pmd, new_pmd;
+
+   old_pmd = READ_ONCE(*pmdp);
+   do {
+   new_pmd = pmd_wrprotect(old_pmd);
+   } while (!try_cmpxchg((pmdval_t *)pmdp, (pmdval_t *)&old_pmd, 
pmd_val(new_pmd)));
+
+   return;
+   }
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
-- 
2.21.0



[PATCH v24 10/30] drm/i915/gvt: Change _PAGE_DIRTY to _PAGE_DIRTY_BITS

2021-04-01 Thread Yu-cheng Yu
After the introduction of _PAGE_COW, a modified page's PTE can have either
_PAGE_DIRTY or _PAGE_COW.  Change _PAGE_DIRTY to _PAGE_DIRTY_BITS.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Reviewed-by: Kirill A. Shutemov 
Cc: David Airlie 
Cc: Joonas Lahtinen 
Cc: Jani Nikula 
Cc: Daniel Vetter 
Cc: Rodrigo Vivi 
Cc: Zhenyu Wang 
Cc: Zhi Wang 
---
 drivers/gpu/drm/i915/gvt/gtt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/gpu/drm/i915/gvt/gtt.c b/drivers/gpu/drm/i915/gvt/gtt.c
index 897c007ea96a..937b6083b2dc 100644
--- a/drivers/gpu/drm/i915/gvt/gtt.c
+++ b/drivers/gpu/drm/i915/gvt/gtt.c
@@ -1216,7 +1216,7 @@ static int split_2MB_gtt_entry(struct intel_vgpu *vgpu,
}
 
/* Clear dirty field. */
-   se->val64 &= ~_PAGE_DIRTY;
+   se->val64 &= ~_PAGE_DIRTY_BITS;
 
ops->clear_pse(se);
ops->clear_ips(se);
-- 
2.21.0



[PATCH v24 13/30] mm: Introduce VM_SHADOW_STACK for shadow stack memory

2021-04-01 Thread Yu-cheng Yu
A shadow stack PTE must be read-only and have _PAGE_DIRTY set.  However,
read-only and Dirty PTEs also exist for copy-on-write (COW) pages.  These
two cases are handled differently for page faults.  Introduce
VM_SHADOW_STACK to track shadow stack VMAs.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.
- Change CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK to reflect Kconfig changes.

 Documentation/filesystems/proc.rst | 1 +
 arch/x86/mm/mmap.c | 2 ++
 fs/proc/task_mmu.c | 3 +++
 include/linux/mm.h | 8 
 4 files changed, 14 insertions(+)

diff --git a/Documentation/filesystems/proc.rst 
b/Documentation/filesystems/proc.rst
index 48fbfc336ebf..5d8a2d75c799 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -549,6 +549,7 @@ encoded manner. The codes are the following:
 mg    mergable advise flag
 bt    arm64 BTI guarded page
 mt    arm64 MTE allocation tags are enabled
+ss    shadow stack page
 ==    =======================================
 
 Note that there is no guarantee that every flag and associated mnemonic will
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..f3f52c5e2fd6 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -165,6 +165,8 @@ unsigned long get_mmap_base(int is_legacy)
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
+   if (vma->vm_flags & VM_SHADOW_STACK)
+   return "[shadow stack]";
return NULL;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..0aa57de9dfab 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct 
vm_area_struct *vma)
[ilog2(VM_PKEY_BIT4)]   = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+   [ilog2(VM_SHADOW_STACK)]	= "ss",
+#endif
};
size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 8ba434287387..08282eb2f195 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -312,11 +312,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2 34  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_4 36  /* bit only usable on 64-bit 
architectures */
+#define VM_HIGH_ARCH_BIT_5 37  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -332,6 +334,12 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
 
+#ifdef CONFIG_X86_SHADOW_STACK
+# define VM_SHADOW_STACK   VM_HIGH_ARCH_5
+#else
+# define VM_SHADOW_STACK   VM_NONE
+#endif
+
 #if defined(CONFIG_X86)
 # define VM_PAT	VM_ARCH_1	/* PAT reserves whole VMA at once (x86) */
 #elif defined(CONFIG_PPC)
-- 
2.21.0



[PATCH v24 08/30] x86/mm: Move pmd_write(), pud_write() up in the file

2021-04-01 Thread Yu-cheng Yu
To prepare the introduction of _PAGE_COW, move pmd_write() and
pud_write() up in the file, so that they can be used by other
helpers below.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
 arch/x86/include/asm/pgtable.h | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a02c67291cfc..c1650d0af1b5 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -185,6 +185,18 @@ static inline int pte_write(pte_t pte)
return pte_flags(pte) & _PAGE_RW;
 }
 
+#define pmd_write pmd_write
+static inline int pmd_write(pmd_t pmd)
+{
+   return pmd_flags(pmd) & _PAGE_RW;
+}
+
+#define pud_write pud_write
+static inline int pud_write(pud_t pud)
+{
+   return pud_flags(pud) & _PAGE_RW;
+}
+
 static inline int pte_huge(pte_t pte)
 {
return pte_flags(pte) & _PAGE_PSE;
@@ -1128,12 +1140,6 @@ extern int pmdp_clear_flush_young(struct vm_area_struct 
*vma,
  unsigned long address, pmd_t *pmdp);
 
 
-#define pmd_write pmd_write
-static inline int pmd_write(pmd_t pmd)
-{
-   return pmd_flags(pmd) & _PAGE_RW;
-}
-
 #define __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
 static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned 
long addr,
   pmd_t *pmdp)
@@ -1155,12 +1161,6 @@ static inline void pmdp_set_wrprotect(struct mm_struct 
*mm,
clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
 }
 
-#define pud_write pud_write
-static inline int pud_write(pud_t pud)
-{
-   return pud_flags(pud) & _PAGE_RW;
-}
-
 #ifndef pmdp_establish
 #define pmdp_establish pmdp_establish
 static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
-- 
2.21.0



[PATCH v24 09/30] x86/mm: Introduce _PAGE_COW

2021-04-01 Thread Yu-cheng Yu
There is essentially no room left in the x86 hardware PTEs on some OSes
(not Linux).  That left the hardware architects looking for a way to
represent a new memory type (shadow stack) within the existing bits.
They chose to repurpose a lightly-used state: Write=0, Dirty=1.

The reason it's lightly used is that Dirty=1 is normally set by hardware
and cannot normally be set by hardware on a Write=0 PTE.  Software must
normally be involved to create one of these PTEs, so software can simply
opt to not create them.

In places where Linux normally creates Write=0, Dirty=1, it can use the
software-defined _PAGE_COW in place of the hardware _PAGE_DIRTY.  In other
words, whenever Linux needs to create Write=0, Dirty=1, it instead creates
Write=0, Cow=1, except for shadow stack, which is Write=0, Dirty=1.  This
clearly separates shadow stack from other data, and results in the
following:

(a) A modified, copy-on-write (COW) page: (Write=0, Cow=1)
(b) A R/O page that has been COW'ed: (Write=0, Cow=1)
The user page is in a R/O VMA, and get_user_pages() needs a writable
copy.  The page fault handler creates a copy of the page and sets
the new copy's PTE as Write=0 and Cow=1.
(c) A shadow stack PTE: (Write=0, Dirty=1)
(d) A shared shadow stack PTE: (Write=0, Cow=1)
When a shadow stack page is being shared among processes (this happens
at fork()), its PTE is made Dirty=0, so the next shadow stack access
causes a fault, and the page is duplicated and Dirty=1 is set again.
This is the COW equivalent for shadow stack pages, even though it's
copy-on-access rather than copy-on-write.
(e) A page where the processor observed a Write=1 PTE, started a write, set
Dirty=1, but then observed a Write=0 PTE.  That's possible today, but
will not happen on processors that support shadow stack.

Define _PAGE_COW and update pte_*() helpers and apply the same changes to
pmd and pud.

After this, there are six free bits left in the 64-bit PTE, and no more
free bits in the 32-bit PTE (except for PAE); Shadow Stack is therefore not
implemented for the 32-bit kernel.
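
A minimal sketch of the Dirty-to-Cow conversion the updated helpers perform
(an illustrative variant; pte_set_flags() and pte_clear_flags() are
existing x86 helpers):

	static inline pte_t sketch_pte_wrprotect(pte_t pte)
	{
		/* Move the hardware Dirty bit to the software Cow bit so a
		 * write-protected PTE cannot be mistaken for shadow stack. */
		if (pte_flags(pte) & _PAGE_DIRTY) {
			pte = pte_clear_flags(pte, _PAGE_DIRTY);
			pte = pte_set_flags(pte, _PAGE_COW);
		}
		return pte_clear_flags(pte, _PAGE_RW);
	}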

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
---
v24:
- Replace CONFIG_X86_CET with CONFIG_X86_SHADOW_STACK to reflect the Kconfig
  changes.

 arch/x86/include/asm/pgtable.h   | 195 ---
 arch/x86/include/asm/pgtable_types.h |  42 +-
 2 files changed, 216 insertions(+), 21 deletions(-)

diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c1650d0af1b5..9c056d5815de 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -121,11 +121,21 @@ extern pmdval_t early_pmd_flags;
  * The following only work if pte_present() is true.
  * Undefined behaviour if not..
  */
-static inline int pte_dirty(pte_t pte)
+static inline bool pte_dirty(pte_t pte)
 {
-   return pte_flags(pte) & _PAGE_DIRTY;
+   /*
+* A dirty PTE has Dirty=1 or Cow=1.
+*/
+   return pte_flags(pte) & _PAGE_DIRTY_BITS;
 }
 
+static inline bool pte_shstk(pte_t pte)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return false;
+
+   return (pte_flags(pte) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
+}
 
 static inline u32 read_pkru(void)
 {
@@ -160,9 +170,20 @@ static inline int pte_young(pte_t pte)
return pte_flags(pte) & _PAGE_ACCESSED;
 }
 
-static inline int pmd_dirty(pmd_t pmd)
+static inline bool pmd_dirty(pmd_t pmd)
+{
+   /*
+* A dirty PMD has Dirty=1 or Cow=1.
+*/
+   return pmd_flags(pmd) & _PAGE_DIRTY_BITS;
+}
+
+static inline bool pmd_shstk(pmd_t pmd)
 {
-   return pmd_flags(pmd) & _PAGE_DIRTY;
+   if (!cpu_feature_enabled(X86_FEATURE_SHSTK))
+   return false;
+
+   return (pmd_flags(pmd) & (_PAGE_RW | _PAGE_DIRTY)) == _PAGE_DIRTY;
 }
 
 static inline int pmd_young(pmd_t pmd)
@@ -170,9 +191,12 @@ static inline int pmd_young(pmd_t pmd)
return pmd_flags(pmd) & _PAGE_ACCESSED;
 }
 
-static inline int pud_dirty(pud_t pud)
+static inline bool pud_dirty(pud_t pud)
 {
-   return pud_flags(pud) & _PAGE_DIRTY;
+   /*
+* A dirty PUD has Dirty=1 or Cow=1.
+*/
+   return pud_flags(pud) & _PAGE_DIRTY_BITS;
 }
 
 static inline int pud_young(pud_t pud)
@@ -182,13 +206,23 @@ static inline int pud_young(pud_t pud)
 
 static inline int pte_write(pte_t pte)
 {
-   return pte_flags(pte) & _PAGE_RW;
+   /*
+* Shadow stack pages are always writable - but not by normal
+* instructions, and only by shadow stack operations.  Therefore,
+* the W=0,D=1 test with pte_shstk().
+*/
+   return (pte_flags(pte) & _PAGE_RW) || pte_shstk(pte);
 }
 
 #define pmd_write pmd_write
 static inline int pmd_write(pmd_t pmd)
 {
-   return pmd_flags(pmd) & _PAGE_RW;
+   /*
+* Shadow stack pages are always writable - but not by normal
+* instructions, and only by shadow stack operations.  Therefore,
+* the W=0,D=1 test with pmd_shstk().
+*/
+   return (pmd_flags(pmd) & _PAGE_RW) || pmd_shstk(pmd);

[PATCH v24 07/30] x86/mm: Remove _PAGE_DIRTY from kernel RO pages

2021-04-01 Thread Yu-cheng Yu
The x86 family of processors does not directly create read-only and Dirty
PTEs.  These PTEs are created by software.  One such case is that kernel
read-only pages are historically set up as Dirty.

New processors that support Shadow Stack regard read-only and Dirty PTEs as
shadow stack pages.  This results in ambiguity between shadow stack and
kernel read-only pages.  To resolve this, remove Dirty from kernel read-
only pages.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kirill A. Shutemov 
Cc: "H. Peter Anvin" 
Cc: Kees Cook 
Cc: Thomas Gleixner 
Cc: Dave Hansen 
Cc: Christoph Hellwig 
Cc: Andy Lutomirski 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: Peter Zijlstra 
---
 arch/x86/include/asm/pgtable_types.h | 6 +++---
 arch/x86/mm/pat/set_memory.c | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/pgtable_types.h 
b/arch/x86/include/asm/pgtable_types.h
index f24d7ef8fffa..9db61817dfff 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -192,10 +192,10 @@ enum page_cache_mode {
 #define _KERNPG_TABLE   (__PP|__RW|   0|___A|   0|___D|   0|   0| _ENC)
 #define _PAGE_TABLE_NOENC   (__PP|__RW|_USR|___A|   0|___D|   0|   0)
 #define _PAGE_TABLE (__PP|__RW|_USR|___A|   0|___D|   0|   0| _ENC)
-#define __PAGE_KERNEL_RO(__PP|   0|   0|___A|__NX|___D|   0|___G)
-#define __PAGE_KERNEL_ROX   (__PP|   0|   0|___A|   0|___D|   0|___G)
+#define __PAGE_KERNEL_RO(__PP|   0|   0|___A|__NX|   0|   0|___G)
+#define __PAGE_KERNEL_ROX   (__PP|   0|   0|___A|   0|   0|   0|___G)
 #define __PAGE_KERNEL_NOCACHE   (__PP|__RW|   0|___A|__NX|___D|   0|___G| __NC)
-#define __PAGE_KERNEL_VVAR  (__PP|   0|_USR|___A|__NX|___D|   0|___G)
+#define __PAGE_KERNEL_VVAR  (__PP|   0|_USR|___A|__NX|   0|   0|___G)
 #define __PAGE_KERNEL_LARGE (__PP|__RW|   0|___A|__NX|___D|_PSE|___G)
 #define __PAGE_KERNEL_LARGE_EXEC (__PP|__RW|   0|___A|   0|___D|_PSE|___G)
 #define __PAGE_KERNEL_WP(__PP|__RW|   0|___A|__NX|___D|   0|___G| __WP)
diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..6bebb95a6988 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -1932,7 +1932,7 @@ int set_memory_nx(unsigned long addr, int numpages)
 
 int set_memory_ro(unsigned long addr, int numpages)
 {
-   return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW), 0);
+   return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_RW | _PAGE_DIRTY), 0);
 }
 
 int set_memory_rw(unsigned long addr, int numpages)
-- 
2.21.0
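
The ambiguity this patch removes can be seen with a small sketch, using the
same assumed bit positions as the kernel's _PAGE_RW (bit 1) and _PAGE_DIRTY
(bit 6); illustrative only:

#include <stdio.h>

#define PAGE_RW    (1UL << 1)
#define PAGE_DIRTY (1UL << 6)

int main(void)
{
	unsigned long old_kernel_ro = PAGE_DIRTY;  /* historical RO + Dirty */
	unsigned long new_kernel_ro = 0;           /* RO with Dirty cleared */
	unsigned long shstk = PAGE_DIRTY;          /* shadow stack: W=0,D=1 */

	printf("old RO aliases shstk: %d\n", old_kernel_ro == shstk); /* 1 */
	printf("new RO aliases shstk: %d\n", new_kernel_ro == shstk); /* 0 */
	return 0;
}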



[PATCH v24 05/30] x86/fpu/xstate: Introduce CET MSR and XSAVES supervisor states

2021-04-01 Thread Yu-cheng Yu
Control-flow Enforcement Technology (CET) introduces these MSRs:

MSR_IA32_U_CET (user-mode CET settings),
MSR_IA32_PL3_SSP (user-mode shadow stack pointer),

MSR_IA32_PL0_SSP (kernel-mode shadow stack pointer),
MSR_IA32_PL1_SSP (Privilege Level 1 shadow stack pointer),
MSR_IA32_PL2_SSP (Privilege Level 2 shadow stack pointer),
MSR_IA32_S_CET (kernel-mode CET settings),
MSR_IA32_INT_SSP_TAB (exception shadow stack table).

The two user-mode MSRs belong to XFEATURE_CET_USER.  The first three
kernel-mode MSRs belong to XFEATURE_CET_KERNEL.  Both XSAVES states are
supervisor states.  This means that there is no direct, unprivileged access
to these states, making it harder for an attacker to subvert CET.

For sigreturn and future ptrace() support, the shadow stack address and MSR
reserved bits are checked before being written to the supervisor states.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/fpu/types.h  | 23 +--
 arch/x86/include/asm/fpu/xstate.h |  6 --
 arch/x86/include/asm/msr-index.h  | 19 +++
 arch/x86/kernel/fpu/xstate.c  | 10 +-
 4 files changed, 53 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h
index f5a38a5f3ae1..035eb0ec665e 100644
--- a/arch/x86/include/asm/fpu/types.h
+++ b/arch/x86/include/asm/fpu/types.h
@@ -115,8 +115,8 @@ enum xfeature {
XFEATURE_PT_UNIMPLEMENTED_SO_FAR,
XFEATURE_PKRU,
XFEATURE_PASID,
-   XFEATURE_RSRVD_COMP_11,
-   XFEATURE_RSRVD_COMP_12,
+   XFEATURE_CET_USER,
+   XFEATURE_CET_KERNEL,
XFEATURE_RSRVD_COMP_13,
XFEATURE_RSRVD_COMP_14,
XFEATURE_LBR,
@@ -135,6 +135,8 @@ enum xfeature {
 #define XFEATURE_MASK_PT   (1 << XFEATURE_PT_UNIMPLEMENTED_SO_FAR)
 #define XFEATURE_MASK_PKRU (1 << XFEATURE_PKRU)
 #define XFEATURE_MASK_PASID(1 << XFEATURE_PASID)
+#define XFEATURE_MASK_CET_USER (1 << XFEATURE_CET_USER)
+#define XFEATURE_MASK_CET_KERNEL   (1 << XFEATURE_CET_KERNEL)
 #define XFEATURE_MASK_LBR  (1 << XFEATURE_LBR)
 
 #define XFEATURE_MASK_FPSSE(XFEATURE_MASK_FP | XFEATURE_MASK_SSE)
@@ -237,6 +239,23 @@ struct pkru_state {
u32 pad;
 } __packed;
 
+/*
+ * State component 11 is Control-flow Enforcement user states
+ */
+struct cet_user_state {
+   u64 user_cet;   /* user control-flow settings */
+   u64 user_ssp;   /* user shadow stack pointer */
+};
+
+/*
+ * State component 12 is Control-flow Enforcement kernel states
+ */
+struct cet_kernel_state {
+   u64 kernel_ssp; /* kernel shadow stack */
+   u64 pl1_ssp;/* privilege level 1 shadow stack */
+   u64 pl2_ssp;/* privilege level 2 shadow stack */
+};
+
 /*
  * State component 15: Architectural LBR configuration state.
  * The size of Arch LBR state depends on the number of LBRs (lbr_depth).
diff --git a/arch/x86/include/asm/fpu/xstate.h 
b/arch/x86/include/asm/fpu/xstate.h
index 47a92232d595..582f3575e0bd 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -35,7 +35,8 @@
  XFEATURE_MASK_BNDCSR)
 
 /* All currently supported supervisor features */
-#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID)
+#define XFEATURE_MASK_SUPERVISOR_SUPPORTED (XFEATURE_MASK_PASID | \
+   XFEATURE_MASK_CET_USER)
 
 /*
  * A supervisor state component may not always contain valuable information,
@@ -62,7 +63,8 @@
  * Unsupported supervisor features. When a supervisor feature in this mask is
  * supported in the future, move it to the supported supervisor feature mask.
  */
-#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT)
+#define XFEATURE_MASK_SUPERVISOR_UNSUPPORTED (XFEATURE_MASK_PT | \
+ XFEATURE_MASK_CET_KERNEL)
 
 /* All supervisor states including supported and unsupported states. */
 #define XFEATURE_MASK_SUPERVISOR_ALL (XFEATURE_MASK_SUPERVISOR_SUPPORTED | \
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 546d6ecf0a35..5f4b7edead0b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -933,4 +933,23 @@
 #define MSR_VM_IGNNE0xc0010115
 #define MSR_VM_HSAVE_PA 0xc0010117
 
+/* Control-flow Enforcement Technology MSRs */
+#define MSR_IA32_U_CET 0x06a0 /* user mode cet setting */
+#define MSR_IA32_S_CET 0x06a2 /* kernel mode cet setting */
+#define CET_SHSTK_EN   BIT_ULL(0)
+#define CET_WRSS_ENBIT_ULL(1)
+#define CET_ENDBR_EN   BIT_ULL(2)
+#define CET_LEG_IW_EN  BIT_ULL(3)
+#define CET_NO_TRACK_EN        BIT_ULL(4)
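
A short sketch decoding these MSR_IA32_U_CET control bits (values as defined
above; CET_NO_TRACK_EN's bit 4 is an assumption consistent with its use
later in this series):

#include <stdint.h>
#include <stdio.h>

#define CET_SHSTK_EN    (1ULL << 0)
#define CET_WRSS_EN     (1ULL << 1)
#define CET_ENDBR_EN    (1ULL << 2)
#define CET_LEG_IW_EN   (1ULL << 3)
#define CET_NO_TRACK_EN (1ULL << 4)   /* assumed bit position */

static void decode_u_cet(uint64_t val)
{
	printf("shstk=%d wrss=%d endbr=%d leg_iw=%d no_track=%d\n",
	       !!(val & CET_SHSTK_EN), !!(val & CET_WRSS_EN),
	       !!(val & CET_ENDBR_EN), !!(val & CET_LEG_IW_EN),
	       !!(val & CET_NO_TRACK_EN));
}

int main(void)
{
	/* A typical user-mode setting: shadow stack plus IBT. */
	decode_u_cet(CET_SHSTK_EN | CET_ENDBR_EN | CET_NO_TRACK_EN);
	return 0;
}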

[PATCH v24 14/30] x86/mm: Shadow Stack page fault error checking

2021-04-01 Thread Yu-cheng Yu
Shadow stack accesses are those that are performed by the CPU where it
expects to encounter a shadow stack mapping.  These accesses are performed
implicitly by CALL/RET at the site of the shadow stack pointer, or
explicitly by shadow stack management instructions like WRUSSQ.

Shadow stack accesses to shadow stack mappings can fault in normal, valid
operation, just like regular accesses to regular mappings.  Shadow stacks
need some of the same features, such as delayed allocation, swap, and
copy-on-write.

Shadow stack accesses can also result in errors, such as when a shadow
stack overflows, or if a shadow stack access occurs to a non-shadow-stack
mapping.

In handling a shadow stack page fault, verify it occurs within a shadow
stack mapping.  It is always an error otherwise.  For valid shadow stack
accesses, set FAULT_FLAG_WRITE to effect copy-on-write.  Because clearing
_PAGE_DIRTY (vs. _PAGE_RW) is used to trigger the fault, shadow stack read
fault and shadow stack write fault are not differentiated and both are
handled as a write access.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Kirill A. Shutemov 
---
v24:
- Change VM_SHSTK to VM_SHADOW_STACK.

 arch/x86/include/asm/trap_pf.h |  2 ++
 arch/x86/mm/fault.c| 19 +++
 2 files changed, 21 insertions(+)

diff --git a/arch/x86/include/asm/trap_pf.h b/arch/x86/include/asm/trap_pf.h
index 10b1de500ab1..afa524325e55 100644
--- a/arch/x86/include/asm/trap_pf.h
+++ b/arch/x86/include/asm/trap_pf.h
@@ -11,6 +11,7 @@
  *   bit 3 ==  1: use of reserved bit detected
  *   bit 4 ==  1: fault was an instruction fetch
  *   bit 5 ==  1: protection keys block access
+ *   bit 6 ==  1: shadow stack access fault
  *   bit 15 == 1: SGX MMU page-fault
  */
 enum x86_pf_error_code {
@@ -20,6 +21,7 @@ enum x86_pf_error_code {
X86_PF_RSVD =   1 << 3,
X86_PF_INSTR=   1 << 4,
X86_PF_PK   =   1 << 5,
+   X86_PF_SHSTK=   1 << 6,
X86_PF_SGX  =   1 << 15,
 };
 
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a73347e2cdfc..394e504305b7 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1100,6 +1100,17 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
   (error_code & X86_PF_INSTR), foreign))
return 1;
 
+   /*
+* Verify a shadow stack access is within a shadow stack VMA.
+* It is always an error otherwise.  Normal data access to a
+* shadow stack area is checked in the cases that follow.
+*/
+   if (error_code & X86_PF_SHSTK) {
+   if (!(vma->vm_flags & VM_SHADOW_STACK))
+   return 1;
+   return 0;
+   }
+
if (error_code & X86_PF_WRITE) {
/* write, present and write, not present: */
if (unlikely(!(vma->vm_flags & VM_WRITE)))
@@ -1293,6 +1304,14 @@ void do_user_addr_fault(struct pt_regs *regs,
 
perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
+   /*
+* Clearing _PAGE_DIRTY is used to detect shadow stack access.
+* This method cannot distinguish shadow stack read vs. write.
+* For valid shadow stack accesses, set FAULT_FLAG_WRITE to effect
+* copy-on-write.
+*/
+   if (error_code & X86_PF_SHSTK)
+   flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_WRITE)
flags |= FAULT_FLAG_WRITE;
if (error_code & X86_PF_INSTR)
-- 
2.21.0
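
Restated as a standalone function for clarity (the VM_* flag values here are
assumptions for illustration; the X86_PF_* bits are as defined above):

#include <stdbool.h>
#include <stdio.h>

#define X86_PF_WRITE (1u << 1)        /* from enum x86_pf_error_code */
#define X86_PF_SHSTK (1u << 6)

#define VM_WRITE        (1UL << 1)    /* assumed values, illustration only */
#define VM_SHADOW_STACK (1UL << 37)

/* Mirror of the check added to access_error() above. */
static bool access_allowed(unsigned long error_code, unsigned long vm_flags)
{
	if (error_code & X86_PF_SHSTK)
		return vm_flags & VM_SHADOW_STACK;
	if (error_code & X86_PF_WRITE)
		return vm_flags & VM_WRITE;
	return true;
}

int main(void)
{
	printf("%d\n", access_allowed(X86_PF_SHSTK, VM_SHADOW_STACK)); /* 1 */
	printf("%d\n", access_allowed(X86_PF_SHSTK, VM_WRITE));        /* 0 */
	return 0;
}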



[PATCH v24 06/30] x86/cet: Add control-protection fault handler

2021-04-01 Thread Yu-cheng Yu
A control-protection fault is triggered when a control-flow transfer
attempt violates Shadow Stack or Indirect Branch Tracking constraints.
For example, the return address for a RET instruction differs from the copy
on the shadow stack; or an indirect JMP instruction, without the NOTRACK
prefix, arrives at a non-ENDBR opcode.

The control-protection fault handler works in a similar way to the general-
protection fault handler.  It provides the si_code SEGV_CPERR to the signal
handler.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
Cc: Michael Kerrisk 
---
 arch/x86/include/asm/idtentry.h|  4 ++
 arch/x86/kernel/idt.c  |  4 ++
 arch/x86/kernel/signal_compat.c|  2 +-
 arch/x86/kernel/traps.c| 63 ++
 include/uapi/asm-generic/siginfo.h |  3 +-
 5 files changed, 74 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h
index 5eb3bdf36a41..fa98ca6a17a2 100644
--- a/arch/x86/include/asm/idtentry.h
+++ b/arch/x86/include/asm/idtentry.h
@@ -571,6 +571,10 @@ DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_SS,
exc_stack_segment);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_GP,exc_general_protection);
 DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_AC,exc_alignment_check);
 
+#ifdef CONFIG_X86_CET
+DECLARE_IDTENTRY_ERRORCODE(X86_TRAP_CP, exc_control_protection);
+#endif
+
 /* Raw exception entries which need extra work */
 DECLARE_IDTENTRY_RAW(X86_TRAP_UD,  exc_invalid_op);
 DECLARE_IDTENTRY_RAW(X86_TRAP_BP,  exc_int3);
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index ee1a283f8e96..e8166d9bbb10 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -105,6 +105,10 @@ static const __initconst struct idt_data def_idts[] = {
 #elif defined(CONFIG_X86_32)
SYSG(IA32_SYSCALL_VECTOR,   entry_INT80_32),
 #endif
+
+#ifdef CONFIG_X86_CET
+   INTG(X86_TRAP_CP,   asm_exc_control_protection),
+#endif
 };
 
 /*
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
index a5330ff498f0..dd92490b1e7f 100644
--- a/arch/x86/kernel/signal_compat.c
+++ b/arch/x86/kernel/signal_compat.c
@@ -27,7 +27,7 @@ static inline void signal_compat_build_tests(void)
 */
BUILD_BUG_ON(NSIGILL  != 11);
BUILD_BUG_ON(NSIGFPE  != 15);
-   BUILD_BUG_ON(NSIGSEGV != 9);
+   BUILD_BUG_ON(NSIGSEGV != 10);
BUILD_BUG_ON(NSIGBUS  != 5);
BUILD_BUG_ON(NSIGTRAP != 5);
BUILD_BUG_ON(NSIGCHLD != 6);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index ac1874a2a70e..ee9c88e4e1bb 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -39,6 +39,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -606,6 +607,68 @@ DEFINE_IDTENTRY_ERRORCODE(exc_general_protection)
cond_local_irq_disable(regs);
 }
 
+#ifdef CONFIG_X86_CET
+static const char * const control_protection_err[] = {
+   "unknown",
+   "near-ret",
+   "far-ret/iret",
+   "endbranch",
+   "rstorssp",
+   "setssbsy",
+   "unknown",
+};
+
+static DEFINE_RATELIMIT_STATE(cpf_rate, DEFAULT_RATELIMIT_INTERVAL,
+ DEFAULT_RATELIMIT_BURST);
+
+/*
+ * When a control protection exception occurs, send a signal to the responsible
+ * application.  Currently, control protection is only enabled for user mode.
+ * This exception should not come from kernel mode.
+ */
+DEFINE_IDTENTRY_ERRORCODE(exc_control_protection)
+{
+   struct task_struct *tsk;
+
+   if (!user_mode(regs)) {
+   pr_emerg("PANIC: unexpected kernel control protection fault\n");
+   die("kernel control protection fault", regs, error_code);
+   panic("Machine halted.");
+   }
+
+   cond_local_irq_enable(regs);
+
+   if (!boot_cpu_has(X86_FEATURE_CET))
+   WARN_ONCE(1, "Control protection fault with CET support disabled\n");
+
+   tsk = current;
+   tsk->thread.error_code = error_code;
+   tsk->thread.trap_nr = X86_TRAP_CP;
+
+   /*
+* Ratelimit to prevent log spamming.
+*/
+   if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) &&
+   __ratelimit(&cpf_rate)) {
+   unsigned long ssp;
+   int cpf_type;
+
+   cpf_type = array_index_nospec(error_code, ARRAY_SIZE(control_protection_err));
+
+   rdmsrl(MSR_IA32_PL3_SSP, ssp);
+   pr_emerg("%s[%d] control protection ip:%lx sp:%lx ssp:%lx error:%lx(%s)",
+tsk->comm, task_pid_nr(tsk),
+regs->ip, regs->sp, ssp, error_code,
+control_protection_err[cpf_type]);
+   print_vma_addr(KERN_CONT " in ", regs->ip);
+   p
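
From user space, such a fault is delivered as SIGSEGV with the new si_code.
A hypothetical handler sketch (SEGV_CPERR assumed to be 10, matching the
NSIGSEGV bump above):

#include <signal.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#ifndef SEGV_CPERR
#define SEGV_CPERR 10	/* assumed from the NSIGSEGV != 10 check above */
#endif

static void handler(int sig, siginfo_t *info, void *ucontext)
{
	(void)sig;
	(void)ucontext;
	if (info->si_code == SEGV_CPERR) {
		static const char msg[] = "caught control-protection fault\n";
		write(STDERR_FILENO, msg, sizeof(msg) - 1);
	}
	_exit(1);
}

int main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_sigaction = handler;
	sa.sa_flags = SA_SIGINFO;
	sigaction(SIGSEGV, &sa, NULL);

	/* ... run code that may violate shadow stack/IBT rules here ... */
	return 0;
}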

[PATCH v24 03/30] x86/cpufeatures: Add CET CPU feature flags for Control-flow Enforcement Technology (CET)

2021-04-01 Thread Yu-cheng Yu
Add CPU feature flags for Control-flow Enforcement Technology (CET).

CPUID.(EAX=7,ECX=0):ECX[bit 7] Shadow stack
CPUID.(EAX=7,ECX=0):EDX[bit 20] Indirect Branch Tracking

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for splitting CONFIG_X86_CET to CONFIG_X86_SHADOW_STACK and 
CONFIG_X86_IBT.
- Move DISABLE_IBT definition to the IBT series.

 arch/x86/include/asm/cpufeatures.h   | 2 ++
 arch/x86/include/asm/disabled-features.h | 8 +++-
 arch/x86/kernel/cpu/cpuid-deps.c | 2 ++
 3 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index cc96e26d69f7..bf861fc89fef 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -345,6 +345,7 @@
 #define X86_FEATURE_OSPKE  (16*32+ 4) /* OS Protection Keys Enable 
*/
 #define X86_FEATURE_WAITPKG(16*32+ 5) /* UMONITOR/UMWAIT/TPAUSE 
Instructions */
 #define X86_FEATURE_AVX512_VBMI2   (16*32+ 6) /* Additional AVX512 Vector 
Bit Manipulation Instructions */
+#define X86_FEATURE_SHSTK  (16*32+ 7) /* Shadow Stack */
 #define X86_FEATURE_GFNI   (16*32+ 8) /* Galois Field New 
Instructions */
 #define X86_FEATURE_VAES   (16*32+ 9) /* Vector AES */
 #define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication 
Double Quadword */
@@ -377,6 +378,7 @@
 #define X86_FEATURE_TSXLDTRK   (18*32+16) /* TSX Suspend Load Address 
Tracking */
 #define X86_FEATURE_PCONFIG(18*32+18) /* Intel PCONFIG */
 #define X86_FEATURE_ARCH_LBR   (18*32+19) /* Intel ARCH LBR */
+#define X86_FEATURE_IBT            (18*32+20) /* Indirect Branch Tracking */
 #define X86_FEATURE_AVX512_FP16(18*32+23) /* AVX512 FP16 */
 #define X86_FEATURE_SPEC_CTRL  (18*32+26) /* "" Speculation Control 
(IBRS + IBPB) */
 #define X86_FEATURE_INTEL_STIBP(18*32+27) /* "" Single Thread 
Indirect Branch Predictors */
diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index b7dd944dc867..e5c6ed9373e8 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -68,6 +68,12 @@
 # define DISABLE_SGX   (1 << (X86_FEATURE_SGX & 31))
 #endif
 
+#ifdef CONFIG_X86_SHADOW_STACK
+#define DISABLE_SHSTK  0
+#else
+#define DISABLE_SHSTK  (1 << (X86_FEATURE_SHSTK & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
@@ -88,7 +94,7 @@
 #define DISABLED_MASK14        0
 #define DISABLED_MASK15        0
 #define DISABLED_MASK16        (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \
-                                DISABLE_ENQCMD)
+                                DISABLE_ENQCMD|DISABLE_SHSTK)
 #define DISABLED_MASK17        0
 #define DISABLED_MASK18        0
 #define DISABLED_MASK19        0
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
index 42af31b64c2c..52d9a682a0e6 100644
--- a/arch/x86/kernel/cpu/cpuid-deps.c
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -72,6 +72,8 @@ static const struct cpuid_dep cpuid_deps[] = {
{ X86_FEATURE_AVX512_FP16,  X86_FEATURE_AVX512BW  },
{ X86_FEATURE_ENQCMD,   X86_FEATURE_XSAVES},
{ X86_FEATURE_PER_THREAD_MBA,   X86_FEATURE_MBA   },
+   { X86_FEATURE_SHSTK,X86_FEATURE_XSAVES},
+   { X86_FEATURE_IBT,  X86_FEATURE_XSAVES},
{}
 };
 
-- 
2.21.0
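
These bits can be probed directly from user space with CPUID; a small sketch
using GCC's <cpuid.h> (leaf 7, subleaf 0, bits as quoted above):

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx))
		return 1;

	printf("SHSTK: %s\n", (ecx & (1u << 7))  ? "yes" : "no");
	printf("IBT:   %s\n", (edx & (1u << 20)) ? "yes" : "no");
	return 0;
}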



[PATCH v24 02/30] x86/cet/shstk: Add Kconfig option for Shadow Stack

2021-04-01 Thread Yu-cheng Yu
Shadow Stack provides protection against function return address
corruption.  It is active when the processor supports it, the kernel has
CONFIG_X86_SHADOW_STACK enabled, and the application is built for the
feature.  This is only implemented for the 64-bit kernel.  When it is
enabled, legacy non-Shadow Stack applications continue to work, but without
protection.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for the splitting X86_CET to X86_SHADOW_STACK and X86_IBT.

 arch/x86/Kconfig   | 26 ++
 arch/x86/Kconfig.assembler |  5 +
 2 files changed, 31 insertions(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2792879d398e..f42560b220ef 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -28,6 +28,7 @@ config X86_64
select ARCH_HAS_GIGANTIC_PAGE
select ARCH_SUPPORTS_INT128 if CC_HAS_INT128
select ARCH_USE_CMPXCHG_LOCKREF
+   select ARCH_HAS_SHADOW_STACK
select HAVE_ARCH_SOFT_DIRTY
select MODULES_USE_ELF_RELA
select NEED_DMA_MAP_STATE
@@ -1941,6 +1942,31 @@ config X86_SGX
 
  If unsure, say N.
 
+config ARCH_HAS_SHADOW_STACK
+   def_bool n
+
+config X86_CET
+   def_bool n
+
+config X86_SHADOW_STACK
+   prompt "Intel Shadow Stack"
+   def_bool n
+   depends on AS_WRUSS
+   depends on ARCH_HAS_SHADOW_STACK
+   select ARCH_USES_HIGH_VMA_FLAGS
+   select X86_CET
+   help
+ Shadow Stack protection is a hardware feature that detects function
+ return address corruption.  This helps mitigate ROP attacks.
+ Applications must be enabled to use it, and old userspace does not
+ get protection "for free".
+ Support for this feature is present on the Tiger Lake family of
+ processors released in 2020 or later.  Enabling this feature
+ increases kernel text size by 3.7 KB.
+ See Documentation/x86/intel_cet.rst for more information.
+
+ If unsure, say N.
+
 config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/Kconfig.assembler b/arch/x86/Kconfig.assembler
index 26b8c08e2fc4..00c79dd93651 100644
--- a/arch/x86/Kconfig.assembler
+++ b/arch/x86/Kconfig.assembler
@@ -19,3 +19,8 @@ config AS_TPAUSE
def_bool $(as-instr,tpause %ecx)
help
  Supported by binutils >= 2.31.1 and LLVM integrated assembler >= V7
+
+config AS_WRUSS
+   def_bool $(as-instr,wrussq %rax$(comma)(%rbx))
+   help
+ Supported by binutils >= 2.31 and LLVM integrated assembler
-- 
2.21.0



[PATCH v24 04/30] x86/cpufeatures: Introduce X86_FEATURE_CET and setup functions

2021-04-01 Thread Yu-cheng Yu
Introduce a software-defined X86_FEATURE_CET, which indicates either Shadow
Stack or Indirect Branch Tracking (or both) is present.  Also introduce
related cpu init/setup functions.

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update #ifdef placement to reflect Kconfig changes of splitting shadow stack 
and ibt.

 arch/x86/include/asm/cpufeatures.h  |  2 +-
 arch/x86/include/asm/disabled-features.h|  9 -
 arch/x86/include/uapi/asm/processor-flags.h |  2 ++
 arch/x86/kernel/cpu/common.c| 14 ++
 arch/x86/kernel/cpu/intel.c |  3 +++
 5 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/cpufeatures.h 
b/arch/x86/include/asm/cpufeatures.h
index bf861fc89fef..d771e62677de 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -108,7 +108,7 @@
 #define X86_FEATURE_EXTD_APICID( 3*32+26) /* Extended APICID 
(8 bits) */
 #define X86_FEATURE_AMD_DCM( 3*32+27) /* AMD multi-node processor 
*/
 #define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware 
coordination feedback capability (APERF/MPERF MSRs) */
-/* free( 3*32+29) */
+#define X86_FEATURE_CET            ( 3*32+29) /* Control-flow enforcement */
 #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 
state */
 #define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
 
diff --git a/arch/x86/include/asm/disabled-features.h 
b/arch/x86/include/asm/disabled-features.h
index e5c6ed9373e8..018cd7acd3e9 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -74,13 +74,20 @@
 #define DISABLE_SHSTK  (1 << (X86_FEATURE_SHSTK & 31))
 #endif
 
+#ifdef CONFIG_X86_CET
+#define DISABLE_CET        0
+#else
+#define DISABLE_CET        (1 << (X86_FEATURE_CET & 31))
+#endif
+
 /*
  * Make sure to add features to the correct mask
  */
 #define DISABLED_MASK0 (DISABLE_VME)
 #define DISABLED_MASK1 0
 #define DISABLED_MASK2 0
-#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR)
+#define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR| \
+                        DISABLE_CET)
 #define DISABLED_MASK4 (DISABLE_PCID)
 #define DISABLED_MASK5 0
 #define DISABLED_MASK6 0
diff --git a/arch/x86/include/uapi/asm/processor-flags.h 
b/arch/x86/include/uapi/asm/processor-flags.h
index bcba3c643e63..a8df907e8017 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -130,6 +130,8 @@
 #define X86_CR4_SMAP   _BITUL(X86_CR4_SMAP_BIT)
 #define X86_CR4_PKE_BIT        22 /* enable Protection Keys support */
 #define X86_CR4_PKE            _BITUL(X86_CR4_PKE_BIT)
+#define X86_CR4_CET_BIT        23 /* enable Control-flow Enforcement */
+#define X86_CR4_CET            _BITUL(X86_CR4_CET_BIT)
 
 /*
  * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index ab640abe26b6..6bd07727089e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -510,6 +510,14 @@ static __init int setup_disable_pku(char *arg)
 __setup("nopku", setup_disable_pku);
 #endif /* CONFIG_X86_64 */
 
+static __always_inline void setup_cet(struct cpuinfo_x86 *c)
+{
+   if (!cpu_feature_enabled(X86_FEATURE_CET))
+   return;
+
+   cr4_set_bits(X86_CR4_CET);
+}
+
 /*
  * Some CPU features depend on higher CPUID levels, which may not always
  * be available due to CPUID level capping or broken virtualization
@@ -1255,6 +1263,11 @@ static void __init cpu_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
 
+   if (cmdline_find_option_bool(boot_command_line, "no_user_shstk"))
+   setup_clear_cpu_cap(X86_FEATURE_SHSTK);
+   if (cmdline_find_option_bool(boot_command_line, "no_user_ibt"))
+   setup_clear_cpu_cap(X86_FEATURE_IBT);
+
	arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
if (arglen <= 0)
return;
@@ -1594,6 +1607,7 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 
x86_init_rdrand(c);
setup_pku(c);
+   setup_cet(c);
 
/*
 * Clear/Set all flags overridden by options, need do it
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 0e422a544835..2e11d9555e9b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -334,6 +334,9 @@ static void early_init_intel(struct cpuinfo_x86 *c)
 
 static void bsp_init_intel(struct cpuinfo_x86 *c)
 {
+   if (cpu_has(c, X86_FEATURE_SHSTK) || cpu_has(c, X86_FEATURE_IBT))
+   setup_force_cpu_cap(X86_FEATURE_CET);
+
resctrl_cpu_detect(c);
 }
 
-- 
2.21.0



[PATCH v24 01/30] Documentation/x86: Add CET description

2021-04-01 Thread Yu-cheng Yu
Explain no_user_shstk/no_user_ibt kernel parameters, and introduce a new
document on Control-flow Enforcement Technology (CET).

Signed-off-by: Yu-cheng Yu 
Cc: Kees Cook 
---
v24:
- Update for Kconfig changes from X86_CET to X86_SHADOW_STACK, X86_IBT.
- Update for the change of VM_SHSTK to VM_SHADOW_STACK.

 .../admin-guide/kernel-parameters.txt |   6 +
 Documentation/x86/index.rst   |   1 +
 Documentation/x86/intel_cet.rst   | 136 ++
 3 files changed, 143 insertions(+)
 create mode 100644 Documentation/x86/intel_cet.rst

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 04545725f187..bc79e54be91e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -3220,6 +3220,12 @@
noexec=on: enable non-executable mappings (default)
noexec=off: disable non-executable mappings
 
+   no_user_shstk   [X86-64] Disable Shadow Stack for user-mode
+   applications
+
+   no_user_ibt [X86-64] Disable Indirect Branch Tracking for user-mode
+   applications
+
nosmap  [X86,PPC]
Disable SMAP (Supervisor Mode Access Prevention)
even if it is supported by processor.
diff --git a/Documentation/x86/index.rst b/Documentation/x86/index.rst
index 4693e192b447..cf5250a3cc70 100644
--- a/Documentation/x86/index.rst
+++ b/Documentation/x86/index.rst
@@ -21,6 +21,7 @@ x86-specific Documentation
tlb
mtrr
pat
+   intel_cet
intel-iommu
intel_txt
amd-memory-encryption
diff --git a/Documentation/x86/intel_cet.rst b/Documentation/x86/intel_cet.rst
new file mode 100644
index ..ae30c392994a
--- /dev/null
+++ b/Documentation/x86/intel_cet.rst
@@ -0,0 +1,136 @@
+.. SPDX-License-Identifier: GPL-2.0
+
+=========================================
+Control-flow Enforcement Technology (CET)
+=========================================
+
+[1] Overview
+============
+
+Control-flow Enforcement Technology (CET) is an Intel processor feature
+that provides protection against return/jump-oriented programming (ROP)
+attacks.  It can be set up to protect both applications and the kernel.
+Only user-mode protection is implemented in the 64-bit kernel, including
+support for running legacy 32-bit applications.
+
+CET introduces Shadow Stack and Indirect Branch Tracking.  Shadow stack is
+a secondary stack allocated from memory and cannot be directly modified by
+applications.  When executing a CALL instruction, the processor pushes the
+return address to both the normal stack and the shadow stack.  Upon
+function return, the processor pops the shadow stack copy and compares it
+to the normal stack copy.  If the two differ, the processor raises a
+control-protection fault.  Indirect branch tracking verifies indirect
+CALL/JMP targets are intended as marked by the compiler with 'ENDBR'
+opcodes.
+
+There are two Kconfig options:
+
+X86_SHADOW_STACK and X86_IBT.
+
+To build a CET-enabled kernel, Binutils v2.31 and GCC v8.1 or LLVM v10.0.1
+or later are required.  To build a CET-enabled application, GLIBC v2.28 or
+later is also required.
+
+There are two command-line options for disabling CET features::
+
+no_user_shstk - disables user shadow stack, and
+no_user_ibt   - disables user indirect branch tracking.
+
+At run time, /proc/cpuinfo shows CET features if the processor supports
+CET.
+
+[2] Application Enabling
+========================
+
+An application's CET capability is marked in its ELF header and can be
+verified from readelf/llvm-readelf output:
+
+readelf -n <application> | grep -a SHSTK
+properties: x86 feature: IBT, SHSTK
+
+If an application supports CET and is statically linked, it will run with
+CET protection.  If the application needs any shared libraries, the loader
+checks all dependencies and enables CET when all requirements are met.
+
+[3] Backward Compatibility
+==========================
+
+GLIBC provides a few CET tunables via the GLIBC_TUNABLES environment
+variable:
+
+GLIBC_TUNABLES=glibc.tune.hwcaps=-SHSTK,-IBT
+Turn off SHSTK/IBT.
+
+GLIBC_TUNABLES=glibc.tune.x86_shstk=
+This controls how dlopen() handles SHSTK legacy libraries::
+
+on - continue with SHSTK enabled;
+permissive - continue with SHSTK off.
+
+Details can be found in the GLIBC manual pages.
+
+[4] CET arch_prctl()'s
+==
+
+Several arch_prctl()'s have been added for CET:
+
+arch_prctl(ARCH_X86_CET_STATUS, u64 *addr)
+Return CET feature status.
+
+The parameter 'addr' is a pointer to a user buffer.
+On returning to the caller, the kernel fills the following
+information::
+
+*addr   = shadow stack/indirect branch tracking status
+*(addr + 1) = shadow stack base address
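
A sketch of querying this status from user space, calling arch_prctl() via
syscall(2); ARCH_X86_CET_STATUS (0x3001) is taken from the uapi header later
in this series:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_X86_CET_STATUS 0x3001	/* from asm/prctl.h in this series */

int main(void)
{
	unsigned long long buf[3] = { 0 };

	if (syscall(SYS_arch_prctl, ARCH_X86_CET_STATUS, buf) != 0) {
		perror("arch_prctl(ARCH_X86_CET_STATUS)");
		return 1;
	}
	printf("features=%#llx shstk_base=%#llx shstk_size=%#llx\n",
	       buf[0], buf[1], buf[2]);
	return 0;
}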

[PATCH v24 00/30] Control-flow Enforcement: Shadow Stack

2021-04-01 Thread Yu-cheng Yu
Control-flow Enforcement (CET) is a new Intel processor feature that blocks
return/jump-oriented programming attacks.  Details are in "Intel 64 and
IA-32 Architectures Software Developer's Manual" [1].

CET can protect applications and the kernel.  This series enables only
application-level protection, and has three parts:

  - Shadow stack [2],
  - Indirect branch tracking [3], and
  - Selftests [4].

I have run tests on these patches for quite some time, and they have been
very stable.  Linux distributions with CET are available now, and Intel
processors with CET are already on the market.  It would be nice if CET
support can be accepted into the kernel.  I will be working to address any
issues should they come up.

Changes in v24:
- Split shadow stack and IBT into separate Kconfig options and source
  files, update related areas accordingly.  Specific changes are called out
  in each patch's commit log.
- Patch #15: Instead of arch_maybe_mkwrite(), create x86 versions of
  maybe*_mkwrite().
- Patch #17: Instead changing vm_*_gap(), create x86 versions.
- Patch #24, #25: Split signal handling into two patches, update comments/
  logs.
- Patch #29, #30: Update arch_validate_flags() and use that for checking
  PROT_SHSTK.
- Rebase to Linus tree v5.12-rc5.

[1] Intel 64 and IA-32 Architectures Software Developer's Manual:

https://software.intel.com/en-us/download/intel-64-and-ia-32-
architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4

[2] CET Shadow Stack patches v23:

https://lore.kernel.org/r/20210316151054.5405-1-yu-cheng...@intel.com/

[3] Indirect Branch Tracking patches v23.

https://lore.kernel.org/r/20210316151320.6123-1-yu-cheng...@intel.com/

[4] I am holding off the selftests changes and working to get Reviewed-by's.
The earlier version of the selftests patches:

https://lkml.kernel.org/r/20200521211720.20236-1-yu-cheng...@intel.com/

[5] The kernel ptrace patch is tested with an Intel-internal updated GDB.
I am holding off the kernel ptrace patch to re-test it with my earlier
patch for fixing regset holes.

Yu-cheng Yu (30):
  Documentation/x86: Add CET description
  x86/cet/shstk: Add Kconfig option for Shadow Stack
  x86/cpufeatures: Add CET CPU feature flags for Control-flow
Enforcement Technology (CET)
  x86/cpufeatures: Introduce X86_FEATURE_CET and setup functions
  x86/fpu/xstate: Introduce CET MSR and XSAVES supervisor states
  x86/cet: Add control-protection fault handler
  x86/mm: Remove _PAGE_DIRTY from kernel RO pages
  x86/mm: Move pmd_write(), pud_write() up in the file
  x86/mm: Introduce _PAGE_COW
  drm/i915/gvt: Change _PAGE_DIRTY to _PAGE_DIRTY_BITS
  x86/mm: Update pte_modify for _PAGE_COW
  x86/mm: Update ptep_set_wrprotect() and pmdp_set_wrprotect() for
transition from _PAGE_DIRTY to _PAGE_COW
  mm: Introduce VM_SHADOW_STACK for shadow stack memory
  x86/mm: Shadow Stack page fault error checking
  x86/mm: Update maybe_mkwrite() for shadow stack
  mm: Fixup places that call pte_mkwrite() directly
  mm: Add guard pages around a shadow stack.
  mm/mmap: Add shadow stack pages to memory accounting
  mm: Update can_follow_write_pte() for shadow stack
  mm/mprotect: Exclude shadow stack from preserve_write
  mm: Re-introduce vm_flags to do_mmap()
  x86/cet/shstk: Add user-mode shadow stack support
  x86/cet/shstk: Handle thread shadow stack
  x86/cet/shstk: Introduce shadow stack token setup/verify routines
  x86/cet/shstk: Handle signals for shadow stack
  ELF: Introduce arch_setup_elf_property()
  x86/cet/shstk: Add arch_prctl functions for shadow stack
  mm: Move arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h
  mm: Update arch_validate_flags() to include vma anonymous
  mm: Introduce PROT_SHSTK for shadow stack

 .../admin-guide/kernel-parameters.txt |   6 +
 Documentation/filesystems/proc.rst|   1 +
 Documentation/x86/index.rst   |   1 +
 Documentation/x86/intel_cet.rst   | 136 
 arch/arm64/include/asm/elf.h  |   5 +
 arch/arm64/include/asm/mman.h |   4 +-
 arch/sparc/include/asm/mman.h |   4 +-
 arch/x86/Kconfig  |  28 ++
 arch/x86/Kconfig.assembler|   5 +
 arch/x86/ia32/ia32_signal.c   |  16 +
 arch/x86/include/asm/cet.h|  52 +++
 arch/x86/include/asm/cpufeatures.h|   4 +-
 arch/x86/include/asm/disabled-features.h  |  17 +-
 arch/x86/include/asm/elf.h|  13 +
 arch/x86/include/asm/fpu/internal.h   |   2 +
 arch/x86/include/asm/fpu/types.h  |  23 +-
 arch/x86/include/asm/fpu/xstate.h |   6 +-
 arch/x86/include/asm/idtentry.h   |   4 +
 arch/x86/include/asm/mman.h   |  87 +
 arch/x86/include/asm/mmu_context.h|   3 +
 arch/x86/include/asm/msr-index.h  |  19 ++
 arc

[PATCH v23 9/9] x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

2021-03-16 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add ENDBR to __vdso_sgx_enter_enclave() branch targets.

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
 arch/x86/entry/vdso/vsgx.S | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/vsgx.S b/arch/x86/entry/vdso/vsgx.S
index 86a0e94f68df..1baa9b49053e 100644
--- a/arch/x86/entry/vdso/vsgx.S
+++ b/arch/x86/entry/vdso/vsgx.S
@@ -6,6 +6,7 @@
 #include 
 
 #include "extable.h"
+#include "../calling.h"
 
 /* Relative to %rbp. */
 #define SGX_ENCLAVE_OFFSET_OF_RUN  16
@@ -27,6 +28,7 @@
 SYM_FUNC_START(__vdso_sgx_enter_enclave)
/* Prolog */
.cfi_startproc
+   ENDBR
push%rbp
.cfi_adjust_cfa_offset  8
.cfi_rel_offset %rbp, 0
@@ -62,6 +64,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
 .Lasync_exit_pointer:
 .Lenclu_eenter_eresume:
enclu
+   ENDBR
 
/* EEXIT jumps here unless the enclave is doing something fancy. */
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
@@ -91,6 +94,7 @@ SYM_FUNC_START(__vdso_sgx_enter_enclave)
jmp .Lout
 
 .Lhandle_exception:
+   ENDBR
mov SGX_ENCLAVE_OFFSET_OF_RUN(%rbp), %rbx
 
/* Set the exception info. */
-- 
2.21.0



[PATCH v23 8/9] x86/vdso: Insert endbr32/endbr64 to vDSO

2021-03-16 Thread Yu-cheng Yu
From: "H.J. Lu" 

When Indirect Branch Tracking (IBT) is enabled, vDSO functions may be
called indirectly, and must have ENDBR32 or ENDBR64 as the first
instruction.  The compiler must support -fcf-protection=branch so that it
can be used to compile vDSO.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Acked-by: Andy Lutomirski 
Reviewed-by: Kees Cook 
---
 arch/x86/entry/vdso/Makefile | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 05c4abc2fdfd..c9eccbc06e8c 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -93,6 +93,10 @@ endif
 
 $(vobjs): KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_LTO) $(GCC_PLUGINS_CFLAGS) $(RETPOLINE_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
 
+ifdef CONFIG_X86_CET
+$(vobjs) $(vobjs32): KBUILD_CFLAGS += -fcf-protection=branch
+endif
+
 #
 # vDSO code runs in userspace and -pg doesn't help with profiling anyway.
 #
-- 
2.21.0



[PATCH v23 7/9] x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point

2021-03-16 Thread Yu-cheng Yu
From: "H.J. Lu" 

ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

Add that to __kernel_vsyscall entry point.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Kees Cook 
---
 arch/x86/entry/vdso/vdso32/system_call.S | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/entry/vdso/vdso32/system_call.S 
b/arch/x86/entry/vdso/vdso32/system_call.S
index de1fff7188aa..adbe948c1a81 100644
--- a/arch/x86/entry/vdso/vdso32/system_call.S
+++ b/arch/x86/entry/vdso/vdso32/system_call.S
@@ -7,6 +7,7 @@
 #include 
 #include 
 #include 
+#include "../../calling.h"
 
.text
.globl __kernel_vsyscall
@@ -14,6 +15,7 @@
ALIGN
 __kernel_vsyscall:
CFI_STARTPROC
+   ENDBR
/*
 * Reshuffle regs so that all of any of the entry instructions
 * will preserve enough state.
-- 
2.21.0



[PATCH v23 6/9] x86/entry: Introduce ENDBR macro

2021-03-16 Thread Yu-cheng Yu
ENDBR is a special new instruction for the Indirect Branch Tracking (IBT)
component of CET.  IBT prevents attacks by ensuring that (most) indirect
branches and function calls may only land at ENDBR instructions.  Branches
that don't follow the rules will result in control flow (#CF) exceptions.

ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
instructions are inserted automatically by the compiler, but branch
targets written in assembly must have ENDBR added manually.

There are two ENDBR versions: one for 64-bit and the other for 32.
Introduce a macro to eliminate ifdeffery at call sites.

Signed-off-by: Yu-cheng Yu 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Jarkko Sakkinen 
Cc: Peter Zijlstra 
---
 arch/x86/entry/calling.h | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 07a9331d55e7..a63d33f7f069 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -392,3 +392,21 @@ For 32-bit we have the following conventions - kernel is built with
 .endm
 
 #endif /* CONFIG_SMP */
+/*
+ * ENDBR is an instruction for the Indirect Branch Tracking (IBT) component
+ * of CET.  IBT prevents attacks by ensuring that (most) indirect branches
+ * and function calls may only land at ENDBR instructions.  Branches that don't
+ * follow the rules will result in control flow (#CF) exceptions.
+ * ENDBR is a noop when IBT is unsupported or disabled.  Most ENDBR
+ * instructions are inserted automatically by the compiler, but branch
+ * targets written in assembly must have ENDBR added manually.
+ */
+.macro ENDBR
+#ifdef CONFIG_X86_CET
+#ifdef __i386__
+   endbr32
+#else
+   endbr64
+#endif
+#endif
+.endm
-- 
2.21.0
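
ENDBR has a fixed encoding (endbr64 is f3 0f 1e fa; endbr32 is f3 0f 1e fb),
so whether a branch target carries the marker can be checked by inspecting
its first bytes.  A small sketch:

#include <stdio.h>
#include <string.h>

static int starts_with_endbr64(const void *fn)
{
	static const unsigned char endbr64[] = { 0xf3, 0x0f, 0x1e, 0xfa };

	return memcmp(fn, endbr64, sizeof(endbr64)) == 0;
}

int main(void)
{
	/* Prints 1 when built with -fcf-protection=branch, else usually 0. */
	printf("main starts with endbr64: %d\n",
	       starts_with_endbr64((const void *)main));
	return 0;
}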



[PATCH v23 3/9] x86/cet/ibt: Handle signals for Indirect Branch Tracking

2021-03-16 Thread Yu-cheng Yu
When an indirect CALL/JMP instruction is executed and before it reaches
the target, it is in 'WAIT_ENDBR' status, which can be read from
MSR_IA32_U_CET.  The status is part of a task's status before a signal is
raised and preserved in the signal frame.  It is restored for sigreturn.

IBT state machine is described in Intel SDM Vol. 1, Sec. 18.3.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/kernel/cet.c| 26 --
 arch/x86/kernel/fpu/signal.c |  8 +---
 2 files changed, 29 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 3361706ba950..34a26eb7f259 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -300,6 +300,13 @@ void cet_restore_signal(struct sc_ext *sc_ext)
msr_val |= CET_SHSTK_EN;
}
 
+   if (cet->ibt_enabled) {
+   msr_val |= (CET_ENDBR_EN | CET_NO_TRACK_EN);
+
+   if (sc_ext->wait_endbr)
+   msr_val |= CET_WAIT_ENDBR;
+   }
+
if (test_thread_flag(TIF_NEED_FPU_LOAD))
cet_user_state->user_cet = msr_val;
else
@@ -340,9 +347,24 @@ int cet_setup_signal(bool ia32, unsigned long rstor_addr, struct sc_ext *sc_ext)
sc_ext->ssp = new_ssp;
}
 
-   if (ssp) {
+   if (ssp || cet->ibt_enabled) {
start_update_msrs();
-   wrmsrl(MSR_IA32_PL3_SSP, ssp);
+
+   if (ssp)
+   wrmsrl(MSR_IA32_PL3_SSP, ssp);
+
+   if (cet->ibt_enabled) {
+   u64 r;
+
+   rdmsrl(MSR_IA32_U_CET, r);
+
+   if (r & CET_WAIT_ENDBR) {
+   sc_ext->wait_endbr = 1;
+   r &= ~CET_WAIT_ENDBR;
+   wrmsrl(MSR_IA32_U_CET, r);
+   }
+   }
+
end_update_msrs();
}
 
diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c
index 270e4649f435..b914d74c8ba6 100644
--- a/arch/x86/kernel/fpu/signal.c
+++ b/arch/x86/kernel/fpu/signal.c
@@ -57,7 +57,8 @@ int save_cet_to_sigframe(int ia32, void __user *fp, unsigned long restorer)
 {
int err = 0;
 
-   if (!current->thread.cet.shstk_size)
+   if (!current->thread.cet.shstk_size &&
+   !current->thread.cet.ibt_enabled)
return 0;
 
if (fp) {
@@ -89,7 +90,8 @@ static int get_cet_from_sigframe(int ia32, void __user *fp, struct sc_ext *ext)
 
memset(ext, 0, sizeof(*ext));
 
-   if (!current->thread.cet.shstk_size)
+   if (!current->thread.cet.shstk_size &&
+   !current->thread.cet.ibt_enabled)
return 0;
 
if (fp) {
@@ -577,7 +579,7 @@ static unsigned long fpu__alloc_sigcontext_ext(unsigned long sp)
 * sigcontext_ext is at: fpu + fpu_user_xstate_size +
 * FP_XSTATE_MAGIC2_SIZE, then aligned to 8.
 */
-   if (cet->shstk_size)
+   if (cet->shstk_size || cet->ibt_enabled)
sp -= (sizeof(struct sc_ext) + 8);
 
return sp;
-- 
2.21.0



[PATCH v23 5/9] x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking

2021-03-16 Thread Yu-cheng Yu
From: "H.J. Lu" 

Update ARCH_X86_CET_STATUS and ARCH_X86_CET_DISABLE for Indirect Branch
Tracking.

Signed-off-by: H.J. Lu 
Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/kernel/cet_prctl.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
index 0030c63a08c0..4df1eac41965 100644
--- a/arch/x86/kernel/cet_prctl.c
+++ b/arch/x86/kernel/cet_prctl.c
@@ -22,6 +22,9 @@ static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
buf[2] = cet->shstk_size;
}
 
+   if (cet->ibt_enabled)
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_IBT;
+
return copy_to_user(ubuf, buf, sizeof(buf));
 }
 
@@ -46,6 +49,8 @@ int prctl_cet(int option, u64 arg2)
return -EINVAL;
if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
cet_disable_shstk();
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   cet_disable_ibt();
return 0;
 
case ARCH_X86_CET_LOCK:
-- 
2.21.0
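
A sketch of using ARCH_X86_CET_DISABLE from user space; 0x3002 is taken from
the uapi header in this series, and GNU_PROPERTY_X86_FEATURE_1_IBT is
assumed to be 0x1 per the x86 GNU property ABI:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define ARCH_X86_CET_DISABLE 0x3002
#define GNU_PROPERTY_X86_FEATURE_1_IBT 0x1	/* assumed from the gABI */

int main(void)
{
	/* Fails with EPERM once ARCH_X86_CET_LOCK has been applied. */
	if (syscall(SYS_arch_prctl, ARCH_X86_CET_DISABLE,
		    GNU_PROPERTY_X86_FEATURE_1_IBT) != 0) {
		perror("arch_prctl(ARCH_X86_CET_DISABLE)");
		return 1;
	}
	return 0;
}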



[PATCH v23 0/9] Control-flow Enforcement: Indirect Branch Tracking

2021-03-16 Thread Yu-cheng Yu
Control-flow Enforcement (CET) is a new Intel processor feature that blocks
return/jump-oriented programming attacks.  Details are in "Intel 64 and
IA-32 Architectures Software Developer's Manual" [1].

This is the second part of CET and enables Indirect Branch Tracking (IBT).
It is built on top of the shadow stack series.

Changes in v23:
- Add patch #6: introduce a macro for ENDBR instructions.
- Patch #7: replace endbr32 with ENDBR macro.
- Patch #9: revise, add/replace endbr64 with ENDBR macro.
- Rebase to Linus tree v5.12-rc3.

[1] Intel 64 and IA-32 Architectures Software Developer's Manual:

https://software.intel.com/en-us/download/intel-64-and-ia-32-
architectures-sdm-combined-volumes-1-2a-2b-2c-2d-3a-3b-3c-3d-and-4

[2] Indirect Branch Tracking patches v22:

https://lore.kernel.org/r/20210310220519.16811-1-yu-cheng...@intel.com/

H.J. Lu (3):
  x86/cet/ibt: Update arch_prctl functions for Indirect Branch Tracking
  x86/vdso/32: Add ENDBR to __kernel_vsyscall entry point
  x86/vdso: Insert endbr32/endbr64 to vDSO

Yu-cheng Yu (6):
  x86/cet/ibt: Update Kconfig for user-mode Indirect Branch Tracking
  x86/cet/ibt: User-mode Indirect Branch Tracking support
  x86/cet/ibt: Handle signals for Indirect Branch Tracking
  x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking
  x86/entry: Introduce ENDBR macro
  x86/vdso: Add ENDBR to __vdso_sgx_enter_enclave

 arch/x86/Kconfig |  1 +
 arch/x86/entry/calling.h | 18 
 arch/x86/entry/vdso/Makefile |  4 ++
 arch/x86/entry/vdso/vdso32/system_call.S |  2 +
 arch/x86/entry/vdso/vsgx.S   |  4 ++
 arch/x86/include/asm/cet.h   |  3 ++
 arch/x86/kernel/cet.c| 59 +++-
 arch/x86/kernel/cet_prctl.c  |  5 ++
 arch/x86/kernel/fpu/signal.c |  8 ++--
 arch/x86/kernel/process_64.c |  8 
 10 files changed, 107 insertions(+), 5 deletions(-)

-- 
2.21.0



[PATCH v23 2/9] x86/cet/ibt: User-mode Indirect Branch Tracking support

2021-03-16 Thread Yu-cheng Yu
Introduce user-mode Indirect Branch Tracking (IBT) support.  Add routines
for the setup/disable of IBT.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/cet.h |  3 +++
 arch/x86/kernel/cet.c  | 33 +
 2 files changed, 36 insertions(+)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index c2437378f339..c20c2f671145 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -15,6 +15,7 @@ struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
unsigned intlocked:1;
+   unsigned int    ibt_enabled:1;
 };
 
 #ifdef CONFIG_X86_CET
@@ -27,6 +28,8 @@ void cet_free_shstk(struct task_struct *p);
 int cet_verify_rstor_token(bool ia32, unsigned long ssp, unsigned long *new_ssp);
 void cet_restore_signal(struct sc_ext *sc);
 int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
+int cet_setup_ibt(void);
+void cet_disable_ibt(void);
 #else
 static inline int prctl_cet(int option, u64 arg2) { return -EINVAL; }
 static inline int cet_setup_thread_shstk(struct task_struct *p,
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 12738cdfb5f2..3361706ba950 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -13,6 +13,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -346,3 +348,34 @@ int cet_setup_signal(bool ia32, unsigned long rstor_addr, struct sc_ext *sc_ext)
 
return 0;
 }
+
+int cet_setup_ibt(void)
+{
+   u64 msr_val;
+
+   if (!static_cpu_has(X86_FEATURE_IBT))
+   return -EOPNOTSUPP;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val |= (CET_ENDBR_EN | CET_NO_TRACK_EN);
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 1;
+   return 0;
+}
+
+void cet_disable_ibt(void)
+{
+   u64 msr_val;
+
+   if (!static_cpu_has(X86_FEATURE_IBT))
+   return;
+
+   start_update_msrs();
+   rdmsrl(MSR_IA32_U_CET, msr_val);
+   msr_val &= ~CET_ENDBR_EN;
+   wrmsrl(MSR_IA32_U_CET, msr_val);
+   end_update_msrs();
+   current->thread.cet.ibt_enabled = 0;
+}
-- 
2.21.0



[PATCH v23 4/9] x86/cet/ibt: Update ELF header parsing for Indirect Branch Tracking

2021-03-16 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates features the file supports.
The property is parsed at loading time and passed to
arch_setup_elf_property().  Update it for Indirect Branch Tracking.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/kernel/process_64.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index cda830b0f7ee..11497689a841 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -864,6 +864,14 @@ int arch_setup_elf_property(struct arch_elf_state *state)
r = cet_setup_shstk();
}
 
+   if (r < 0)
+   return r;
+
+   if (static_cpu_has(X86_FEATURE_IBT)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_IBT)
+   r = cet_setup_ibt();
+   }
+
return r;
 }
 #endif
-- 
2.21.0



[PATCH v23 1/9] x86/cet/ibt: Update Kconfig for user-mode Indirect Branch Tracking

2021-03-16 Thread Yu-cheng Yu
Indirect branch tracking is a hardware security feature that verifies near
indirect call/jump instructions arrive at intended targets, which are
labeled by the compiler with ENDBR opcodes.  If such instructions reach
unlabeled locations, the processor raises control-protection faults.

Check the compiler is up-to-date at config time.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/Kconfig | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2c93178262f5..96000ed48469 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1953,6 +1953,7 @@ config X86_CET
def_bool n
depends on AS_WRUSS
depends on ARCH_HAS_SHADOW_STACK
+   depends on $(cc-option,-fcf-protection)
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_MAYBE_MKWRITE
select ARCH_USE_GNU_PROPERTY
-- 
2.21.0



[PATCH v23 28/28] mm: Introduce PROT_SHSTK for shadow stack

2021-03-16 Thread Yu-cheng Yu
There are three possible options to create a shadow stack allocation API:
an arch_prctl, a new syscall, or adding PROT_SHSTK to mmap()/mprotect().
Each has its advantages and compromises.

An arch_prctl() is the least intrusive.  However, the existing x86
arch_prctl() takes only two parameters.  Multiple parameters must be
passed in a memory buffer.  There is a proposal to pass more parameters in
registers [1], but no active discussion on that.

A new syscall minimizes compatibility issues and offers an extensible
framework to other architectures, but this will likely result in some
overlap with mmap()/mprotect().

The introduction of PROT_SHSTK to mmap()/mprotect() takes advantage of
existing APIs.  The x86-specific PROT_SHSTK is translated to VM_SHSTK and
a shadow stack mapping is created without reinventing the wheel.  There are
potential pitfalls though.  The most obvious one would be using this as a
bypass to shadow stack protection.  However, the attacker would have to get
to the syscall first.

[1] https://lore.kernel.org/lkml/20200828121624.108243-1-hjl.to...@gmail.com/

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/mman.h  | 57 +++-
 arch/x86/include/uapi/asm/mman.h |  1 +
 include/linux/mm.h   |  1 +
 mm/mmap.c|  8 -
 4 files changed, 65 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
index 629f6c81263a..bd94e30b5d34 100644
--- a/arch/x86/include/asm/mman.h
+++ b/arch/x86/include/asm/mman.h
@@ -20,11 +20,66 @@
((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
 
-#define arch_calc_vm_prot_bits(prot, key) (\
+#define pkey_vm_prot_bits(prot, key) ( \
((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#else
+#define pkey_vm_prot_bits(prot, key) (0)
+#endif
+
+static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot,
+  unsigned long pkey)
+{
+   unsigned long vm_prot_bits = pkey_vm_prot_bits(prot, pkey);
+
+   if (!(prot & PROT_WRITE) && (prot & PROT_SHSTK))
+   vm_prot_bits |= VM_SHSTK;
+
+   return vm_prot_bits;
+}
+
+#define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey)
+
+#ifdef CONFIG_X86_CET
+static inline bool arch_validate_prot(unsigned long prot, unsigned long addr)
+{
+   unsigned long valid = PROT_READ | PROT_WRITE | PROT_EXEC | PROT_SEM;
+
+   if (prot & ~(valid | PROT_SHSTK))
+   return false;
+
+   if (prot & PROT_SHSTK) {
+   struct vm_area_struct *vma;
+
+   if (!current->thread.cet.shstk_size)
+   return false;
+
+   /*
+* A shadow stack mapping is writable only indirectly, by the
+* CALL and WRUSS instructions, not by other write
+* instructions.  PROT_SHSTK and PROT_WRITE are mutually
+* exclusive.
+*/
+   if (prot & PROT_WRITE)
+   return false;
+
+   vma = find_vma(current->mm, addr);
+   if (!vma)
+   return false;
+
+   /*
+* Shadow stack cannot be backed by a file or shared.
+*/
+   if (vma->vm_file || (vma->vm_flags & VM_SHARED))
+   return false;
+   }
+
+   return true;
+}
+
+#define arch_validate_prot arch_validate_prot
 #endif
 
 #endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index 3ce1923e6ed9..39bb7db344a6 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -4,6 +4,7 @@
 
 #define MAP_32BIT  0x40/* only give out 32bit addresses */
 
+#define PROT_SHSTK 0x10/* shadow stack pages */
 
 #include 
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e178be052419..40c4b0fe7cc4 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -342,6 +342,7 @@ extern unsigned int kobjsize(const void *objp);
 
 #if defined(CONFIG_X86)
 # define VM_PAT        VM_ARCH_1   /* PAT reserves whole VMA at once (x86) */
+# define VM_ARCH_CLEAR VM_SHSTK
 #elif defined(CONFIG_PPC)
 # define VM_SAO        VM_ARCH_1   /* Strong Access Ordering (powerpc) */
 #elif defined(CONFIG_PARISC)
diff --git a/mm/mmap.c b/mm/mmap.c
index 99077171010b..934cb3cbe952 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1481,6 +1481,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
struct inode
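
A sketch of requesting a shadow stack mapping with the new flag.  PROT_SHSTK
(0x10) is taken from the uapi header above; per the checks in this patch the
request must not include PROT_WRITE, and it is expected to fail unless the
task is already running with shadow stack enabled:

#include <stdio.h>
#include <sys/mman.h>

#ifndef PROT_SHSTK
#define PROT_SHSTK 0x10	/* from arch/x86/include/uapi/asm/mman.h above */
#endif

int main(void)
{
	void *ss = mmap(NULL, 4096, PROT_READ | PROT_SHSTK,
			MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (ss == MAP_FAILED) {
		perror("mmap(PROT_SHSTK)");
		return 1;
	}
	printf("shadow stack mapping at %p\n", ss);
	return 0;
}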

[PATCH v23 26/28] x86/cet/shstk: Add arch_prctl functions for shadow stack

2021-03-16 Thread Yu-cheng Yu
arch_prctl(ARCH_X86_CET_STATUS, u64 *args)
Get CET feature status.

The parameter 'args' is a pointer to a user buffer.  The kernel returns
the following information:

*args = shadow stack/IBT status
*(args + 1) = shadow stack base address
*(args + 2) = shadow stack size

32-bit binaries use the same interface, but only the lower 32 bits of each
item.

arch_prctl(ARCH_X86_CET_DISABLE, unsigned int features)
Disable CET features specified in 'features'.  Return -EPERM if CET is
locked.

arch_prctl(ARCH_X86_CET_LOCK)
Lock in CET features.

Also change do_arch_prctl_common()'s parameter 'cpuid_enabled' to
'arg2', as it is now also passed to prctl_cet().

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/cet.h|  3 ++
 arch/x86/include/uapi/asm/prctl.h |  4 +++
 arch/x86/kernel/Makefile  |  2 +-
 arch/x86/kernel/cet_prctl.c   | 60 +++
 arch/x86/kernel/process.c |  6 ++--
 5 files changed, 71 insertions(+), 4 deletions(-)
 create mode 100644 arch/x86/kernel/cet_prctl.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 5d66340c7a13..c2437378f339 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -14,9 +14,11 @@ struct sc_ext;
 struct cet_status {
unsigned long   shstk_base;
unsigned long   shstk_size;
+   unsigned int    locked:1;
 };
 
 #ifdef CONFIG_X86_CET
+int prctl_cet(int option, u64 arg2);
 int cet_setup_shstk(void);
 int cet_setup_thread_shstk(struct task_struct *p, unsigned long clone_flags,
   unsigned long stack_size);
@@ -26,6 +28,7 @@ int cet_verify_rstor_token(bool ia32, unsigned long ssp, unsigned long *new_ssp)
 void cet_restore_signal(struct sc_ext *sc);
 int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
 #else
+static inline int prctl_cet(int option, u64 arg2) { return -EINVAL; }
 static inline int cet_setup_thread_shstk(struct task_struct *p,
 unsigned long clone_flags,
 unsigned long stack_size) { return 0; }
diff --git a/arch/x86/include/uapi/asm/prctl.h 
b/arch/x86/include/uapi/asm/prctl.h
index 5a6aac9fa41f..9245bf629120 100644
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -14,4 +14,8 @@
 #define ARCH_MAP_VDSO_32   0x2002
 #define ARCH_MAP_VDSO_64   0x2003
 
+#define ARCH_X86_CET_STATUS0x3001
+#define ARCH_X86_CET_DISABLE   0x3002
+#define ARCH_X86_CET_LOCK  0x3003
+
 #endif /* _ASM_X86_PRCTL_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index ed77517e527a..c4b6ffe54915 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,7 +150,7 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
-obj-$(CONFIG_X86_CET)  += cet.o
+obj-$(CONFIG_X86_CET)  += cet.o cet_prctl.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/cet_prctl.c b/arch/x86/kernel/cet_prctl.c
new file mode 100644
index ..0030c63a08c0
--- /dev/null
+++ b/arch/x86/kernel/cet_prctl.c
@@ -0,0 +1,60 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/* See Documentation/x86/intel_cet.rst. */
+
+static int cet_copy_status_to_user(struct cet_status *cet, u64 __user *ubuf)
+{
+   u64 buf[3] = {};
+
+   if (cet->shstk_size) {
+   buf[0] |= GNU_PROPERTY_X86_FEATURE_1_SHSTK;
+   buf[1] = cet->shstk_base;
+   buf[2] = cet->shstk_size;
+   }
+
+   return copy_to_user(ubuf, buf, sizeof(buf));
+}
+
+int prctl_cet(int option, u64 arg2)
+{
+   struct cet_status *cet;
+
+   if (!cpu_feature_enabled(X86_FEATURE_CET))
+   return -ENOTSUPP;
+
+   cet = &current->thread.cet;
+
+   if (option == ARCH_X86_CET_STATUS)
+   return cet_copy_status_to_user(cet, (u64 __user *)arg2);
+
+   switch (option) {
+   case ARCH_X86_CET_DISABLE:
+   if (cet->locked)
+   return -EPERM;
+
+   if (arg2 & ~GNU_PROPERTY_X86_FEATURE_1_VALID)
+   return -EINVAL;
+   if (arg2 & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   cet_disable_shstk();
+   return 0;
+
+   case ARCH_X86_CET_LOCK:
+   if (arg2)
+   return -EINVAL;
+   cet->locked = 1;
+   return 0;
+
+   default:
+   return -ENOSYS;
+   }
+}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index b7c8fe2d93ec..8071b5d770fb 100644
--- a/arch/x86/kernel/proces

[PATCH v23 13/28] mm: Introduce VM_SHSTK for shadow stack memory

2021-03-16 Thread Yu-cheng Yu
A shadow stack PTE must be read-only and have _PAGE_DIRTY set.  However,
read-only and Dirty PTEs also exist for copy-on-write (COW) pages.  These
two cases are handled differently for page faults.  Introduce VM_SHSTK to
track shadow stack VMAs.
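
To make the ambiguity concrete: both a COW page and a shadow stack page
can have a Write=0, Dirty=1 PTE, so the fault path must consult the VMA to
tell the two apart.  A hypothetical kernel-context helper (not code from
this patch; VM_SHSTK is introduced below):

static bool fault_in_shadow_stack_vma(struct vm_area_struct *vma)
{
	/* Write=0, Dirty=1 PTEs are ambiguous; the VMA flag decides. */
	return vma->vm_flags & VM_SHSTK;
}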

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 Documentation/filesystems/proc.rst | 1 +
 arch/x86/mm/mmap.c | 2 ++
 fs/proc/task_mmu.c | 3 +++
 include/linux/mm.h | 8 
 4 files changed, 14 insertions(+)

diff --git a/Documentation/filesystems/proc.rst 
b/Documentation/filesystems/proc.rst
index 48fbfc336ebf..5d8a2d75c799 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -549,6 +549,7 @@ encoded manner. The codes are the following:
 mg    mergable advise flag
 bt    arm64 BTI guarded page
 mt    arm64 MTE allocation tags are enabled
+ss    shadow stack page
 ==    =======================================
 
 Note that there is no guarantee that every flag and associated mnemonic will
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c90c20904a60..a22c6b6fc607 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -165,6 +165,8 @@ unsigned long get_mmap_base(int is_legacy)
 
 const char *arch_vma_name(struct vm_area_struct *vma)
 {
+   if (vma->vm_flags & VM_SHSTK)
+   return "[shadow stack]";
return NULL;
 }
 
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e862cab69583..59d57425cc21 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -661,6 +661,9 @@ static void show_smap_vma_flags(struct seq_file *m, struct 
vm_area_struct *vma)
[ilog2(VM_PKEY_BIT4)]   = "",
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
+#ifdef CONFIG_X86_CET
+   [ilog2(VM_SHSTK)]   = "ss",
+#endif
};
size_t i;
 
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 64a71bf20536..a6c18c5752d6 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -312,11 +312,13 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_HIGH_ARCH_BIT_2 34  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_3 35  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_BIT_4 36  /* bit only usable on 64-bit 
architectures */
+#define VM_HIGH_ARCH_BIT_5 37  /* bit only usable on 64-bit 
architectures */
 #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0)
 #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1)
 #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2)
 #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3)
 #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4)
+#define VM_HIGH_ARCH_5 BIT(VM_HIGH_ARCH_BIT_5)
 #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */
 
 #ifdef CONFIG_ARCH_HAS_PKEYS
@@ -332,6 +334,12 @@ extern unsigned int kobjsize(const void *objp);
 #endif
 #endif /* CONFIG_ARCH_HAS_PKEYS */
 
+#ifdef CONFIG_X86_CET
+# define VM_SHSTK  VM_HIGH_ARCH_5
+#else
+# define VM_SHSTK  VM_NONE
+#endif
+
 #if defined(CONFIG_X86)
 # define VM_PATVM_ARCH_1   /* PAT reserves whole VMA at 
once (x86) */
 #elif defined(CONFIG_PPC)
-- 
2.21.0



[PATCH v23 27/28] mm: Move arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h

2021-03-16 Thread Yu-cheng Yu
To prepare changes to arch_calc_vm_prot_bits() in the next patch, and be
consistent with other architectures, move arch_vm_get_page_prot() and
arch_calc_vm_prot_bits() to arch/x86/include/asm/mman.h.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/mman.h  | 30 ++
 arch/x86/include/uapi/asm/mman.h | 27 +++
 2 files changed, 33 insertions(+), 24 deletions(-)
 create mode 100644 arch/x86/include/asm/mman.h

diff --git a/arch/x86/include/asm/mman.h b/arch/x86/include/asm/mman.h
new file mode 100644
index ..629f6c81263a
--- /dev/null
+++ b/arch/x86/include/asm/mman.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_MMAN_H
+#define _ASM_X86_MMAN_H
+
+#include 
+#include 
+
+#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
+/*
+ * Take the 4 protection key bits out of the vma->vm_flags
+ * value and turn them in to the bits that we can put in
+ * to a pte.
+ *
+ * Only override these if Protection Keys are available
+ * (which is only on 64-bit).
+ */
+#define arch_vm_get_page_prot(vm_flags)__pgprot(   \
+   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
+   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
+
+#define arch_calc_vm_prot_bits(prot, key) (\
+   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
+   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
+   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
+   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
+#endif
+
+#endif /* _ASM_X86_MMAN_H */
diff --git a/arch/x86/include/uapi/asm/mman.h b/arch/x86/include/uapi/asm/mman.h
index d4a8d0424bfb..3ce1923e6ed9 100644
--- a/arch/x86/include/uapi/asm/mman.h
+++ b/arch/x86/include/uapi/asm/mman.h
@@ -1,31 +1,10 @@
 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-#ifndef _ASM_X86_MMAN_H
-#define _ASM_X86_MMAN_H
+#ifndef _UAPI_ASM_X86_MMAN_H
+#define _UAPI_ASM_X86_MMAN_H
 
 #define MAP_32BIT  0x40/* only give out 32bit addresses */
 
-#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS
-/*
- * Take the 4 protection key bits out of the vma->vm_flags
- * value and turn them in to the bits that we can put in
- * to a pte.
- *
- * Only override these if Protection Keys are available
- * (which is only on 64-bit).
- */
-#define arch_vm_get_page_prot(vm_flags)__pgprot(   \
-   ((vm_flags) & VM_PKEY_BIT0 ? _PAGE_PKEY_BIT0 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT1 ? _PAGE_PKEY_BIT1 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT2 ? _PAGE_PKEY_BIT2 : 0) | \
-   ((vm_flags) & VM_PKEY_BIT3 ? _PAGE_PKEY_BIT3 : 0))
-
-#define arch_calc_vm_prot_bits(prot, key) (\
-   ((key) & 0x1 ? VM_PKEY_BIT0 : 0) |  \
-   ((key) & 0x2 ? VM_PKEY_BIT1 : 0) |  \
-   ((key) & 0x4 ? VM_PKEY_BIT2 : 0) |  \
-   ((key) & 0x8 ? VM_PKEY_BIT3 : 0))
-#endif
 
 #include 
 
-#endif /* _ASM_X86_MMAN_H */
+#endif /* _UAPI_ASM_X86_MMAN_H */
-- 
2.21.0



[PATCH v23 25/28] x86/cet/shstk: Handle thread shadow stack

2021-03-16 Thread Yu-cheng Yu
The kernel allocates (and frees on thread exit) a new shadow stack for a
pthread child.

The kernel could instead complete the clone syscall with the child's
shadow stack pointer set to NULL and let the child thread allocate a
shadow stack for itself.  There are two issues with that approach: it is
not compatible with existing code that makes the clone syscall inline, and
it cannot handle signals delivered before the child has successfully
allocated a shadow stack.

Use the stack_size passed from the clone3() syscall for the thread shadow
stack size, but cap it to min(RLIMIT_STACK, 4 GB).  A compat-mode thread's
shadow stack size is further reduced to 1/4 of that, which allows more
threads to run in a 32-bit address space; see the sizing sketch below.
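
For illustration, the sizing rule amounts to the following kernel-context
sketch (a hypothetical helper using the kernel's min(), rlimit(), and
round_up(); the real logic is in cet_setup_thread_shstk() below):

static unsigned long thread_shstk_size(unsigned long stack_size, bool compat)
{
	/* Cap at min(RLIMIT_STACK, 4 GB), then at the clone3() stack_size. */
	unsigned long size = min(rlimit(RLIMIT_STACK), 1UL << 32);

	size = min(size, stack_size);
	if (compat)
		size /= 4;	/* conserve the 32-bit address space */
	return round_up(size, PAGE_SIZE);
}

E.g. with RLIMIT_STACK = 8 MB and a 2 MB clone3() stack, a 64-bit thread
gets a 2 MB shadow stack and a compat-mode thread gets 512 KB.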

Signed-off-by: Yu-cheng Yu 
---
 arch/x86/include/asm/cet.h |  5 +++
 arch/x86/include/asm/mmu_context.h |  3 ++
 arch/x86/kernel/cet.c  | 49 ++
 arch/x86/kernel/process.c  | 15 +++--
 4 files changed, 69 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
index 73435856ce54..5d66340c7a13 100644
--- a/arch/x86/include/asm/cet.h
+++ b/arch/x86/include/asm/cet.h
@@ -18,12 +18,17 @@ struct cet_status {
 
 #ifdef CONFIG_X86_CET
 int cet_setup_shstk(void);
+int cet_setup_thread_shstk(struct task_struct *p, unsigned long clone_flags,
+  unsigned long stack_size);
 void cet_disable_shstk(void);
 void cet_free_shstk(struct task_struct *p);
 int cet_verify_rstor_token(bool ia32, unsigned long ssp, unsigned long 
*new_ssp);
 void cet_restore_signal(struct sc_ext *sc);
 int cet_setup_signal(bool ia32, unsigned long rstor, struct sc_ext *sc);
 #else
+static inline int cet_setup_thread_shstk(struct task_struct *p,
+unsigned long clone_flags,
+unsigned long stack_size) { return 0; }
 static inline void cet_disable_shstk(void) {}
 static inline void cet_free_shstk(struct task_struct *p) {}
 static inline void cet_restore_signal(struct sc_ext *sc) { return; }
diff --git a/arch/x86/include/asm/mmu_context.h 
b/arch/x86/include/asm/mmu_context.h
index 27516046117a..e90bd2ee8498 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -11,6 +11,7 @@
 
 #include 
 #include 
+#include 
 #include 
 
 extern atomic64_t last_mm_ctx_id;
@@ -146,6 +147,8 @@ do {\
 #else
 #define deactivate_mm(tsk, mm) \
 do {   \
+   if (!tsk->vfork_done)   \
+   cet_free_shstk(tsk);\
load_gs_index(0);   \
loadsegment(fs, 0); \
 } while (0)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
index 08e43d9b5176..12738cdfb5f2 100644
--- a/arch/x86/kernel/cet.c
+++ b/arch/x86/kernel/cet.c
@@ -172,6 +172,55 @@ int cet_setup_shstk(void)
return 0;
 }
 
+int cet_setup_thread_shstk(struct task_struct *tsk, unsigned long clone_flags,
+  unsigned long stack_size)
+{
+   unsigned long addr, size;
+   struct cet_user_state *state;
+   struct cet_status *cet = &tsk->thread.cet;
+
+   if (!cet->shstk_size)
+   return 0;
+
+   if ((clone_flags & (CLONE_VFORK | CLONE_VM)) != CLONE_VM)
+   return 0;
+
+   state = get_xsave_addr(&tsk->thread.fpu.state.xsave,
+  XFEATURE_CET_USER);
+
+   if (!state)
+   return -EINVAL;
+
+   if (stack_size == 0)
+   return -EINVAL;
+
+   /* Cap shadow stack size to 4 GB */
+   size = min(rlimit(RLIMIT_STACK), 1UL << 32);
+   size = min(size, stack_size);
+
+   /*
+* Compat-mode pthreads share a limited address space.
+* If each function call takes an average of four slots
+* stack space, allocate 1/4 of stack size for shadow stack.
+*/
+   if (in_compat_syscall())
+   size /= 4;
+   size = round_up(size, PAGE_SIZE);
+   addr = alloc_shstk(size, 0);
+
+   if (IS_ERR_VALUE(addr)) {
+   cet->shstk_base = 0;
+   cet->shstk_size = 0;
+   return PTR_ERR((void *)addr);
+   }
+
+   fpu__prepare_write(&tsk->thread.fpu);
+   state->user_ssp = (u64)(addr + size);
+   cet->shstk_base = addr;
+   cet->shstk_size = size;
+   return 0;
+}
+
 void cet_disable_shstk(void)
 {
	struct cet_status *cet = &current->thread.cet;
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9c214d7085a4..b7c8fe2d93ec 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -43,6 +43,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "process.h"
 
@@ -109,6 +110,7 @@ void exit_thread(struct task_struct *tsk)
 

[PATCH v23 24/28] ELF: Introduce arch_setup_elf_property()

2021-03-16 Thread Yu-cheng Yu
An ELF file's .note.gnu.property indicates arch features supported by the
file.  These features are extracted by arch_parse_elf_property() and stored
in 'arch_elf_state'.

Introduce x86 feature definitions and arch_setup_elf_property(), which
enables such features.  The first use-case of this function is Shadow
Stack.

ARM64 is the other arch that has ARCH_USE_GNU_PROPERTY and
arch_parse_elf_property().  Add arch_setup_elf_property() for it.
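
For GNU_PROPERTY_X86_FEATURE_1_AND, the note payload is a single 32-bit
word of feature bits (readelf -n displays these under .note.gnu.property).
Once the word is extracted, the shadow stack decision reduces to a bit
test; a hypothetical helper, not code from this patch:

static bool elf_wants_shstk(const struct arch_elf_state *state)
{
	/* Set when the binary was built with e.g. -fcf-protection. */
	return state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_SHSTK;
}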

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
Cc: Mark Brown 
Cc: Catalin Marinas 
Cc: Dave Martin 
---
 arch/arm64/include/asm/elf.h |  5 +
 arch/x86/Kconfig |  2 ++
 arch/x86/include/asm/elf.h   | 13 +
 arch/x86/kernel/process_64.c | 32 
 fs/binfmt_elf.c  |  4 
 include/linux/elf.h  |  6 ++
 include/uapi/linux/elf.h |  9 +
 7 files changed, 71 insertions(+)

diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h
index 8d1c8dcb87fd..d37bc7915935 100644
--- a/arch/arm64/include/asm/elf.h
+++ b/arch/arm64/include/asm/elf.h
@@ -281,6 +281,11 @@ static inline int arch_parse_elf_property(u32 type, const 
void *data,
return 0;
 }
 
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
+
 static inline int arch_elf_pt_proc(void *ehdr, void *phdr,
   struct file *f, bool is_interp,
   struct arch_elf_state *state)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 102212025993..2c93178262f5 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1955,6 +1955,8 @@ config X86_CET
depends on ARCH_HAS_SHADOW_STACK
select ARCH_USES_HIGH_VMA_FLAGS
select ARCH_MAYBE_MKWRITE
+   select ARCH_USE_GNU_PROPERTY
+   select ARCH_BINFMT_ELF_STATE
help
  Control-flow protection is a set of hardware features which place
  additional restrictions on indirect branches.  These help
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 9224d40cdefe..6a131047be8a 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -390,6 +390,19 @@ extern int compat_arch_setup_additional_pages(struct 
linux_binprm *bprm,
 
 extern bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs);
 
+#ifdef CONFIG_ARCH_BINFMT_ELF_STATE
+struct arch_elf_state {
+   unsigned int gnu_property;
+};
+
+#define INIT_ARCH_ELF_STATE {  \
+   .gnu_property = 0,  \
+}
+
+#define arch_elf_pt_proc(ehdr, phdr, elf, interp, state) (0)
+#define arch_check_elf(ehdr, interp, interp_ehdr, state) (0)
+#endif
+
 /* Do not change the values. See get_align_mask() */
 enum align_flags {
ALIGN_VA_32 = BIT(0),
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index d08307df69ad..cda830b0f7ee 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -835,3 +835,35 @@ unsigned long KSTK_ESP(struct task_struct *task)
 {
return task_pt_regs(task)->sp;
 }
+
+#ifdef CONFIG_ARCH_USE_GNU_PROPERTY
+int arch_parse_elf_property(u32 type, const void *data, size_t datasz,
+   bool compat, struct arch_elf_state *state)
+{
+   if (type != GNU_PROPERTY_X86_FEATURE_1_AND)
+   return 0;
+
+   if (datasz != sizeof(unsigned int))
+   return -ENOEXEC;
+
+   state->gnu_property = *(unsigned int *)data;
+   return 0;
+}
+
+int arch_setup_elf_property(struct arch_elf_state *state)
+{
+   int r = 0;
+
+   if (!IS_ENABLED(CONFIG_X86_CET))
+   return r;
+
+   memset(&current->thread.cet, 0, sizeof(struct cet_status));
+
+   if (static_cpu_has(X86_FEATURE_SHSTK)) {
+   if (state->gnu_property & GNU_PROPERTY_X86_FEATURE_1_SHSTK)
+   r = cet_setup_shstk();
+   }
+
+   return r;
+}
+#endif
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..fa665eceba04 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -1248,6 +1248,10 @@ static int load_elf_binary(struct linux_binprm *bprm)
 
set_binfmt(&elf_format);
 
+   retval = arch_setup_elf_property(&arch_state);
+   if (retval < 0)
+   goto out;
+
 #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
if (retval < 0)
diff --git a/include/linux/elf.h b/include/linux/elf.h
index c9a46c4e183b..be04d15e937f 100644
--- a/include/linux/elf.h
+++ b/include/linux/elf.h
@@ -92,9 +92,15 @@ static inline int arch_parse_elf_property(u32 type, const 
void *data,
 {
return 0;
 }
+
+static inline int arch_setup_elf_property(struct arch_elf_state *arch)
+{
+   return 0;
+}
 #else
 extern int arch_parse_elf_property(u32 type, const void *data, size_t datasz,
   bool compat, struct arch_elf_state *arch

[PATCH v23 18/28] mm/mmap: Add shadow stack pages to memory accounting

2021-03-16 Thread Yu-cheng Yu
Account shadow stack pages to stack memory.

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/mm/pgtable.c   |  7 +++
 include/linux/pgtable.h | 11 +++
 mm/mmap.c   |  5 +
 3 files changed, 23 insertions(+)

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 0f4fbf51a9fc..948d28c29964 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -895,3 +895,10 @@ int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 
 #endif /* CONFIG_X86_64 */
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+bool arch_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return (vm_flags & VM_SHSTK);
+}
+#endif
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cbd98484c4f1..487c08df4365 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1470,6 +1470,17 @@ static inline pmd_t arch_maybe_pmd_mkwrite(pmd_t pmd, 
struct vm_area_struct *vma
 #endif /* CONFIG_ARCH_MAYBE_MKWRITE */
 #endif /* CONFIG_MMU */
 
+#ifdef CONFIG_MMU
+#ifdef CONFIG_ARCH_HAS_SHADOW_STACK
+bool arch_shadow_stack_mapping(vm_flags_t vm_flags);
+#else
+static inline bool arch_shadow_stack_mapping(vm_flags_t vm_flags)
+{
+   return false;
+}
+#endif /* CONFIG_ARCH_HAS_SHADOW_STACK */
+#endif /* CONFIG_MMU */
+
 /*
  * Architecture PAGE_KERNEL_* fallbacks
  *
diff --git a/mm/mmap.c b/mm/mmap.c
index 3f287599a7a3..2ac67882ace2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1718,6 +1718,9 @@ static inline int accountable_mapping(struct file *file, 
vm_flags_t vm_flags)
if (file && is_file_hugepages(file))
return 0;
 
+   if (arch_shadow_stack_mapping(vm_flags))
+   return 1;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
 }
 
@@ -3387,6 +3390,8 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t 
flags, long npages)
mm->stack_vm += npages;
else if (is_data_mapping(flags))
mm->data_vm += npages;
+   else if (arch_shadow_stack_mapping(flags))
+   mm->stack_vm += npages;
 }
 
 static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
-- 
2.21.0



[PATCH v23 22/28] x86/cet/shstk: User-mode shadow stack support

2021-03-16 Thread Yu-cheng Yu
Introduce the basic shadow stack enabling, disabling, and allocation
routines.  A task's shadow stack is allocated from anonymous memory with
the VM_SHSTK flag and has a fixed size of min(RLIMIT_STACK, 4 GB).
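
Once arch_vma_name() reports the flag (see the VM_SHSTK patch), the region
allocated here becomes visible to user space; an illustrative
/proc/<pid>/maps line (addresses made up) would look like:

7f5c12300000-7f5c12b00000 r--p 00000000 00:00 0          [shadow stack]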

Signed-off-by: Yu-cheng Yu 
Reviewed-by: Kees Cook 
---
 arch/x86/include/asm/cet.h   |  28 ++
 arch/x86/include/asm/processor.h |   5 ++
 arch/x86/kernel/Makefile |   2 +
 arch/x86/kernel/cet.c| 147 +++
 4 files changed, 182 insertions(+)
 create mode 100644 arch/x86/include/asm/cet.h
 create mode 100644 arch/x86/kernel/cet.c

diff --git a/arch/x86/include/asm/cet.h b/arch/x86/include/asm/cet.h
new file mode 100644
index ..5750fbcbb952
--- /dev/null
+++ b/arch/x86/include/asm/cet.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_CET_H
+#define _ASM_X86_CET_H
+
+#ifndef __ASSEMBLY__
+#include 
+
+struct task_struct;
+/*
+ * Per-thread CET status
+ */
+struct cet_status {
+   unsigned long   shstk_base;
+   unsigned long   shstk_size;
+};
+
+#ifdef CONFIG_X86_CET
+int cet_setup_shstk(void);
+void cet_disable_shstk(void);
+void cet_free_shstk(struct task_struct *p);
+#else
+static inline void cet_disable_shstk(void) {}
+static inline void cet_free_shstk(struct task_struct *p) {}
+#endif
+
+#endif /* __ASSEMBLY__ */
+
+#endif /* _ASM_X86_CET_H */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index dc6d149bf851..3fce5062261b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -27,6 +27,7 @@ struct vm86;
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -535,6 +536,10 @@ struct thread_struct {
 
unsigned intsig_on_uaccess_err:1;
 
+#ifdef CONFIG_X86_CET
+   struct cet_status   cet;
+#endif
+
/* Floating point and extended processor state */
struct fpu  fpu;
/*
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ddf08351f0b..ed77517e527a 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -150,6 +150,8 @@ obj-$(CONFIG_UNWINDER_FRAME_POINTER)+= 
unwind_frame.o
 obj-$(CONFIG_UNWINDER_GUESS)   += unwind_guess.o
 
 obj-$(CONFIG_AMD_MEM_ENCRYPT)  += sev-es.o
+obj-$(CONFIG_X86_CET)  += cet.o
+
 ###
 # 64 bit specific files
 ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c
new file mode 100644
index ..d25a03215984
--- /dev/null
+++ b/arch/x86/kernel/cet.c
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * cet.c - Control-flow Enforcement (CET)
+ *
+ * Copyright (c) 2019, Intel Corporation.
+ * Yu-cheng Yu 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+static void start_update_msrs(void)
+{
+   fpregs_lock();
+   if (test_thread_flag(TIF_NEED_FPU_LOAD))
+   __fpregs_load_activate();
+}
+
+static void end_update_msrs(void)
+{
+   fpregs_unlock();
+}
+
+static unsigned long cet_get_shstk_addr(void)
+{
+   struct fpu *fpu = &current->thread.fpu;
+   unsigned long ssp = 0;
+
+   fpregs_lock();
+
+   if (fpregs_state_valid(fpu, smp_processor_id())) {
+   rdmsrl(MSR_IA32_PL3_SSP, ssp);
+   } else {
+   struct cet_user_state *p;
+
+   p = get_xsave_addr(&fpu->state.xsave, XFEATURE_CET_USER);
+   if (p)
+   ssp = p->user_ssp;
+   }
+
+   fpregs_unlock();
+   return ssp;
+}
+
+static unsigned long alloc_shstk(unsigned long size, int flags)
+{
+   struct mm_struct *mm = current->mm;
+   unsigned long addr, populate;
+
+   /* VM_SHSTK requires MAP_ANONYMOUS, MAP_PRIVATE */
+   flags |= MAP_ANONYMOUS | MAP_PRIVATE;
+
+   mmap_write_lock(mm);
+   addr = do_mmap(NULL, 0, size, PROT_READ, flags, VM_SHSTK, 0,
+  &populate, NULL);
+   mmap_write_unlock(mm);
+
+   if (populate)
+   mm_populate(addr, populate);
+
+   return addr;
+}
+
+int cet_setup_shstk(void)
+{
+   unsigned long addr, size;
+   struct cet_status *cet = &current->thread.cet;
+
+   if (!static_cpu_has(X86_FEATURE_SHSTK))
+   return -EOPNOTSUPP;
+
+   size = round_up(min(rlimit(RLIMIT_STACK), 1UL << 32), PAGE_SIZE);
+   addr = alloc_shstk(size, 0);
+
+   if (IS_ERR_VALUE(addr))
+   return PTR_ERR((void *)addr);
+
+   cet->shstk_base = addr;
+   cet->shstk_size = size;
+
+   start_update_msrs();
+   wrmsrl(MSR_IA32_PL3_SSP, addr + size);
+   wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN);
+   end_update_msrs();
+   return 0;
+}
+
+void cet_disable_shstk(void)
+{
+   struct cet_status *cet = &current->thread.cet;
+   u64 msr_val;
+
+   if (!static_cpu_has(X86_FEATURE_SHSTK) ||
+   !cet->shstk_si
