[PATCH v5 24/27] x86/mm: Make the x86 GOT read-only
The GOT is changed during early boot when relocations are applied, before
memory protections are set up, so the table can simply be placed in the
read-only data section. This table only exists for a PIE binary.

Position Independent Executable (PIE) support will allow extending the
KASLR randomization range below 0xffffffff80000000.

Signed-off-by: Thomas Garnier
---
 include/asm-generic/vmlinux.lds.h | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index e373e2e10f6a..e5b0710fe693 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -314,6 +314,17 @@
 	__end_ro_after_init = .;
 #endif
 
+#ifdef CONFIG_X86_PIE
+#define RO_GOT_X86							\
+	.got : AT(ADDR(.got) - LOAD_OFFSET) {				\
+		VMLINUX_SYMBOL(__start_got) = .;			\
+		*(.got);						\
+		VMLINUX_SYMBOL(__end_got) = .;				\
+	}
+#else
+#define RO_GOT_X86
+#endif
+
 /*
  * Read only Data
  */
@@ -370,6 +381,7 @@
 		__end_builtin_fw = .;					\
 	}								\
 									\
+	RO_GOT_X86							\
 	TRACEDATA							\
 									\
 	/* Kernel symbol table: Normal symbols */			\
-- 
2.18.0.rc2.346.g013aa6912e-goog
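As an illustration only (this is not code from the series), a minimal sketch
of how a GOT bounded by the __start_got/__end_got markers above can be fixed
up while it is still writable, assuming each entry is a 64-bit absolute
address that only needs the load delta added:

	extern unsigned long __start_got[], __end_got[];

	/* Sketch: add the KASLR/load delta to every GOT entry. */
	static void adjust_got_entries(unsigned long load_delta)
	{
		unsigned long *entry;

		for (entry = __start_got; entry < __end_got; entry++)
			*entry += load_delta;
	}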
[PATCH v5 27/27] x86/kaslr: Add option to extend KASLR range from 1GB to 3GB
Add a new CONFIG_RANDOMIZE_BASE_LARGE option to benefit from PIE support.
It increases the KASLR range from 1GB to 3GB. The new range starts at
0xffffffff00000000, just above the EFI memory region. This option is off
by default.

The boot code is adapted to create the appropriate page table spanning
three PUD pages.

The relocation table uses 64-bit integers generated with the updated
relocation tool with the large-reloc option.

Signed-off-by: Thomas Garnier
---
 arch/x86/Kconfig                     | 21 +++++++++++++++++++++
 arch/x86/boot/compressed/Makefile    |  5 +++++
 arch/x86/boot/compressed/misc.c      | 10 +++++++++-
 arch/x86/include/asm/page_64_types.h |  9 +++++++++
 arch/x86/kernel/head64.c             | 15 ++++++++++++---
 arch/x86/kernel/head_64.S            | 11 ++++++++++-
 6 files changed, 66 insertions(+), 5 deletions(-)

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 42f77aff5df1..f6cb20a66e8a 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2236,6 +2236,27 @@ config X86_PIE
 	select DYNAMIC_MODULE_BASE
 	select MODULE_REL_CRCS if MODVERSIONS
 
+config RANDOMIZE_BASE_LARGE
+	bool "Increase the randomization range of the kernel image"
+	depends on X86_64 && RANDOMIZE_BASE
+	select X86_PIE
+	select X86_MODULE_PLTS if MODULES
+	default n
+	---help---
+	  Build the kernel as a Position Independent Executable (PIE) and
+	  increase the available randomization range from 1GB to 3GB.
+
+	  This option impacts performance on kernel CPU intensive workloads up
+	  to 10% due to PIE generated code. Impact on user-mode processes and
+	  typical usage would be significantly less (0.50% when you build the
+	  kernel).
+
+	  The kernel and modules will generate slightly more assembly (1 to 2%
+	  increase on the .text sections). The vmlinux binary will be
+	  significantly smaller due to less relocations.
+
+	  If unsure say N
+
 config HOTPLUG_CPU
 	bool "Support for hot-pluggable CPUs"
 	depends on SMP
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index fa42f895fdde..8497ebd5e078 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -116,7 +116,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs
 
+# Large randomization require bigger relocation table
+ifeq ($(CONFIG_RANDOMIZE_BASE_LARGE),y)
+CMD_RELOCS = arch/x86/tools/relocs --large-reloc
+else
 CMD_RELOCS = arch/x86/tools/relocs
+endif
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $<
 $(obj)/vmlinux.relocs: vmlinux FORCE
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8dd1d5ccae58..28d17bd5bad8 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -171,10 +171,18 @@ void __puthex(unsigned long value)
 	}
 }
 #if CONFIG_X86_NEED_RELOCS
+
+/* Large randomization go lower than -2G and use large relocation table */
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+typedef long rel_t;
+#else
+typedef int rel_t;
+#endif
+
 static void handle_relocations(void *output, unsigned long output_len,
 			       unsigned long virt_addr)
 {
-	int *reloc;
+	rel_t *reloc;
 	unsigned long delta, map, ptr;
 	unsigned long min_addr = (unsigned long)output;
 	unsigned long max_addr = min_addr + (VO___bss_start - VO__text);
diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h
index 6afac386a434..4a9f5ad945b4 100644
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -46,7 +46,11 @@
 #define __PAGE_OFFSET		__PAGE_OFFSET_BASE_L4
 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */
 
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define __START_KERNEL_map	_AC(0xffffffff00000000, UL)
+#else
 #define __START_KERNEL_map	_AC(0xffffffff80000000, UL)
+#endif /* CONFIG_RANDOMIZE_BASE_LARGE */
 
 /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 
@@ -64,9 +68,14 @@
  * 512MiB by default, leaving 1.5GiB for modules once the page tables
  * are fully set up. If kernel ASLR is configured, it can extend the
  * kernel page table mapping, reducing the size of the modules area.
+ * On PIE, we relocate the binary 2G lower so add this extra space.
  */
 #if defined(CONFIG_RANDOMIZE_BASE)
+#ifdef CONFIG_RANDOMIZE_BASE_LARGE
+#define KERNEL_IMAGE_SIZE	(_AC(3, UL) * 1024 * 1024 * 1024)
+#else
 #define KERNEL_IMAGE_SIZE	(1024 * 1024 * 1024)
+#endif
 #else
 #define KERNEL_IMAGE_SIZE	(512 * 1024 * 1024)
 #endif
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 14bbbe592772..2276198dd2bd 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -61,6 +61,7 @@ EXPORT_SYM
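As a rough illustration of what the range increase buys (not code from this
series; the program is made up, and the calculation ignores the space taken
by the kernel image itself), the number of 2MB-aligned virtual slots KASLR
can choose from in each configuration:

	#include <stdio.h>

	#define GB		(1024UL * 1024 * 1024)
	#define KASLR_ALIGN	0x200000UL	/* 2MB slot granularity */

	int main(void)
	{
		unsigned long small = 1 * GB;	/* default, base 0xffffffff80000000 */
		unsigned long large = 3 * GB;	/* RANDOMIZE_BASE_LARGE, base 0xffffffff00000000 */

		printf("default slots: %lu\n", small / KASLR_ALIGN);	/* 512 */
		printf("large slots:   %lu\n", large / KASLR_ALIGN);	/* 1536 */
		return 0;
	}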
[PATCH v5 25/27] x86/pie: Add option to build the kernel as PIE
Add the CONFIG_X86_PIE option which builds the kernel as a Position Independent Executable (PIE). The kernel is currently build with the mcmodel=kernel option which forces it to stay on the top 2G of the virtual address space. With PIE, the kernel will be able to move below the current limit. The --emit-relocs linker option was kept instead of using -pie to limit the impact on mapped sections. Any incompatible relocation will be catch by the arch/x86/tools/relocs binary at compile time. If segment based stack cookies are enabled, try to use the compiler option to select the segment register. If not available, automatically enabled global stack cookie in auto mode. Otherwise, recommend compiler update or global stack cookie option. Performance/Size impact: Size of vmlinux (Default configuration): File size: - PIE disabled: +0.18% - PIE enabled: -1.977% (less relocations) .text section: - PIE disabled: same - PIE enabled: same Size of vmlinux (Ubuntu configuration): File size: - PIE disabled: +0.21% - PIE enabled: +10% .text section: - PIE disabled: same - PIE enabled: +0.001% The size increase is mainly due to not having access to the 32-bit signed relocation that can be used with mcmodel=kernel. A small part is due to reduced optimization for PIE code. This bug [1] was opened with gcc to provide a better code generation for kernel PIE. Hackbench (50% and 1600% on thread/process for pipe/sockets): - PIE disabled: no significant change (avg -/+ 0.5% on latest test). - PIE enabled: between -1% to +1% in average (default and Ubuntu config). Kernbench (average of 10 Half and Optimal runs): Elapsed Time: - PIE disabled: no significant change (avg -0.5%) - PIE enabled: average -0.5% to +0.5% System Time: - PIE disabled: no significant change (avg -0.1%) - PIE enabled: average -0.4% to +0.4%. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82303 Signed-off-by: Thomas Garnier merge pie --- arch/x86/Kconfig | 8 arch/x86/Makefile | 45 - 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f98a47662023..42f77aff5df1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2228,6 +2228,14 @@ config X86_GLOBAL_STACKPROTECTOR If unsure, say N +config X86_PIE + bool + depends on X86_64 + select DEFAULT_HIDDEN + select WEAK_PROVIDE_HIDDEN + select DYNAMIC_MODULE_BASE + select MODULE_REL_CRCS if MODVERSIONS + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 112ccdc99566..b3647d78c26e 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -60,6 +60,8 @@ endif KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) +stackglobal := $(call cc-option-yn, -mstack-protector-guard=global) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -135,7 +137,48 @@ else KBUILD_CFLAGS += -mno-red-zone ifdef CONFIG_X86_PIE +KBUILD_CFLAGS += -fPIE KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/x86/kernel/module.lds + +# Relax relocation in both CFLAGS and LDFLAGS to support older compilers +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) +LDFLAGS_vmlinux += $(call ld-option,--no-relax) +KBUILD_LDFLAGS_MODULE += $(call ld-option,--no-relax) + +# Stack validation is not yet support due to self-referenced switches +ifdef CONFIG_STACK_VALIDATION +$(warning CONFIG_STACK_VALIDATION is not yet supported for x86_64 pie \ + build.) 
+SKIP_STACK_VALIDATION := 1 +export SKIP_STACK_VALIDATION +endif + +ifndef CONFIG_CC_STACKPROTECTOR_NONE +ifndef CONFIG_X86_GLOBAL_STACKPROTECTOR +stackseg-flag := -mstack-protector-guard-reg=%gs +ifeq ($(call cc-option-yn,$(stackseg-flag)),n) +# Try to enable global stack cookie if possible +ifeq ($(stackglobal), y) +$(warning Cannot use CONFIG_CC_STACKPROTECTOR_* while \ +building a position independent kernel. \ +Default to global stack protector \ +(CONFIG_X86_GLOBAL_STACKPROTECTOR).) +CONFIG_X86_GLOBAL_STACKPROTECTOR := y +KBUILD_CFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR +KBUILD_AFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR +else +$(error echo Cannot use \ +CONFIG_CC_STACKPROTECTOR_(REGULAR|STRONG|AUTO) \ +while building a position independent binary. \ +Update your compiler or use \ +CONFIG_X86_GLOBAL_STAC
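As an illustration of the code-generation difference CONFIG_X86_PIE implies
(a sketch only; example_counter is a made-up symbol and the exact
instructions depend on the compiler version):

	/* Sketch only: how a global may be addressed under each model. */
	unsigned long example_counter;

	unsigned long read_counter(void)
	{
		/*
		 * mcmodel=kernel: the compiler may emit a 32-bit signed
		 * absolute reference (R_X86_64_32S), which assumes the image
		 * sits in the top 2GB of the address space.
		 *
		 * -fPIE: references become RIP-relative (or go through the
		 * GOT for non-hidden symbols), so the image can be placed
		 * anywhere.
		 */
		return example_counter;
	}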
[PATCH v5 26/27] x86/relocs: Add option to generate 64-bit relocations
The x86 relocation tool generates a list of 32-bit signed integers. There was no need to use 64-bit integers because all addresses where above the 2G top of the memory. This change add a large-reloc option to generate 64-bit unsigned integers. It can be used when the kernel plan to go below the top 2G and 32-bit integers are not enough. Signed-off-by: Thomas Garnier --- arch/x86/tools/relocs.c| 60 +++--- arch/x86/tools/relocs.h| 4 +-- arch/x86/tools/relocs_common.c | 15 ++--- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 29283ad3950f..a29eaac6 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -13,8 +13,14 @@ static Elf_Ehdr ehdr; +#if ELF_BITS == 64 +typedef uint64_t rel_off_t; +#else +typedef uint32_t rel_off_t; +#endif + struct relocs { - uint32_t*offset; + rel_off_t *offset; unsigned long count; unsigned long size; }; @@ -685,7 +691,7 @@ static void print_absolute_relocs(void) printf("\n"); } -static void add_reloc(struct relocs *r, uint32_t offset) +static void add_reloc(struct relocs *r, rel_off_t offset) { if (r->count == r->size) { unsigned long newsize = r->size + 5; @@ -1061,26 +1067,48 @@ static void sort_relocs(struct relocs *r) qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); } -static int write32(uint32_t v, FILE *f) +static int write32(rel_off_t rel, FILE *f) { - unsigned char buf[4]; + unsigned char buf[sizeof(uint32_t)]; + uint32_t v = (uint32_t)rel; put_unaligned_le32(v, buf); - return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; + return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1; } -static int write32_as_text(uint32_t v, FILE *f) +static int write32_as_text(rel_off_t rel, FILE *f) { + uint32_t v = (uint32_t)rel; return fprintf(f, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1; } -static void emit_relocs(int as_text, int use_real_mode) +static int write64(rel_off_t rel, FILE *f) +{ + unsigned char buf[sizeof(uint64_t)]; + uint64_t v = (uint64_t)rel; + + put_unaligned_le64(v, buf); + return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1; +} + +static int write64_as_text(rel_off_t rel, FILE *f) +{ + uint64_t v = (uint64_t)rel; + return fprintf(f, "\t.quad 0x%016"PRIx64"\n", v) > 0 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode, int use_large_reloc) { int i; - int (*write_reloc)(uint32_t, FILE *) = write32; + int (*write_reloc)(rel_off_t, FILE *); int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname); + if (use_large_reloc) + write_reloc = write64; + else + write_reloc = write32; + #if ELF_BITS == 64 if (!use_real_mode) do_reloc = do_reloc64; @@ -1091,6 +1119,9 @@ static void emit_relocs(int as_text, int use_real_mode) do_reloc = do_reloc32; else do_reloc = do_reloc_real; + + /* Large relocations only for 64-bit */ + use_large_reloc = 0; #endif /* Collect up the relocations */ @@ -1114,8 +1145,13 @@ static void emit_relocs(int as_text, int use_real_mode) * gas will like. 
*/ printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - write_reloc = write32_as_text; + if (use_large_reloc) { + printf(".balign 8\n"); + write_reloc = write64_as_text; + } else { + printf(".balign 4\n"); + write_reloc = write32_as_text; + } } if (use_real_mode) { @@ -1183,7 +1219,7 @@ static void print_reloc_info(void) void process(FILE *fp, int use_real_mode, int as_text, int show_absolute_syms, int show_absolute_relocs, -int show_reloc_info) +int show_reloc_info, int use_large_reloc) { regex_init(use_real_mode); read_ehdr(fp); @@ -1206,5 +1242,5 @@ void process(FILE *fp, int use_real_mode, int as_text, print_reloc_info(); return; } - emit_relocs(as_text, use_real_mode); + emit_relocs(as_text, use_real_mode, use_large_reloc); } diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index 43c83c0fd22c..3d401da59df7 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -31,8 +31,8 @@ enum symtype { void process_32(FILE *fp, int use_real_mode, int as_text, int show_absolute_syms, int show_a
[PATCH v5 21/27] x86/ftrace: Adapt function tracing for PIE support
When using PIE with function tracing, the compiler generates a call through the GOT (call *__fentry__@GOTPCREL). This instruction takes 6-bytes instead of 5-bytes with a relative call. If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte nop so ftrace can handle the previous 5-bytes as before. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/ftrace.c | 51 +- scripts/recordmcount.c | 79 +++- 2 files changed, 102 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 01ebcb6f263e..2194a5d3e095 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -102,7 +102,7 @@ static const unsigned char *ftrace_nop_replace(void) static int ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) + unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -135,6 +135,53 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, return 0; } +/* Bytes before call GOT offset */ +static const unsigned char got_call_preinsn[] = { 0xff, 0x15 }; + +static int +ftrace_modify_initial_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + unsigned char replaced[MCOUNT_INSN_SIZE + 1]; + + /* +* If PIE is not enabled default to the original approach to code +* modification. +*/ + if (!IS_ENABLED(CONFIG_X86_PIE)) + return ftrace_modify_code_direct(ip, old_code, new_code); + + ftrace_expected = old_code; + + /* Ensure the instructions point to a call to the GOT */ + if (probe_kernel_read(replaced, (void *)ip, sizeof(replaced))) { + WARN_ONCE(1, "invalid function"); + return -EFAULT; + } + + if (memcmp(replaced, got_call_preinsn, sizeof(got_call_preinsn))) { + WARN_ONCE(1, "invalid function call"); + return -EINVAL; + } + + /* +* Build a nop slide with a 5-byte nop and 1-byte nop to keep the ftrace +* hooking algorithm working with the expected 5 bytes instruction. +*/ + memset(replaced, ideal_nops[1][0], sizeof(replaced)); + memcpy(replaced, new_code, MCOUNT_INSN_SIZE); + + ip = text_ip_addr(ip); + + if (probe_kernel_write((void *)ip, replaced, sizeof(replaced))) + return -EPERM; + + sync_core(); + + return 0; + +} + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -153,7 +200,7 @@ int ftrace_make_nop(struct module *mod, * just modify the code directly. 
*/ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_initial_code(rec->ip, old, new); ftrace_expected = NULL; diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 895c40e8679f..aa71b912958d 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -171,33 +171,9 @@ umalloc(size_t size) return addr; } -static unsigned char ideal_nop5_x86_64[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; -static unsigned char ideal_nop5_x86_32[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -static unsigned char *ideal_nop; - static char rel_type_nop; - static int (*make_nop)(void *map, size_t const offset); - -static int make_nop_x86(void *map, size_t const offset) -{ - uint32_t *ptr; - unsigned char *op; - - /* Confirm we have 0xe8 0x0 0x0 0x0 0x0 */ - ptr = map + offset; - if (*ptr != 0) - return -1; - - op = map + offset - 1; - if (*op != 0xe8) - return -1; - - /* convert to nop */ - ulseek(fd_map, offset - 1, SEEK_SET); - uwrite(fd_map, ideal_nop, 5); - return 0; -} +static unsigned char *ideal_nop; static unsigned char ideal_nop4_arm_le[4] = { 0x00, 0x00, 0xa0, 0xe1 }; /* mov r0, r0 */ static unsigned char ideal_nop4_arm_be[4] = { 0xe1, 0xa0, 0x00, 0x00 }; /* mov r0, r0 */ @@ -447,6 +423,50 @@ static void MIPS64_r_info(Elf64_Rel *const rp, unsigned sym, unsigned type) }).r_info; } +static unsigned char ideal_nop5_x86_64[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; +static unsigned char ideal_nop6_x86_64[6] = { 0x66, 0x0f, 0x1f, 0x44, 0x00, 0x00 }; +static unsigned char ideal_nop5_x86_32[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; +static size_t ideal_nop_x86_size; + +static unsigned char stub_default_x86[2] = { 0xe8, 0x00 }; /* call relative */ +static unsigned char stub_got_x86[3] = { 0xff, 0x15, 0x00 }; /* call .got */ +static unsigned char *stub_x86; +static size_t st
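The byte-level rewrite described above can be summarized with the following
sketch (illustration only; the kernel code is ftrace_modify_initial_code()
in the hunk above):

	/*
	 * before: ff 15 xx xx xx xx   call *__fentry__@GOTPCREL(%rip) (6 bytes)
	 * after:  0f 1f 44 00 00 90   5-byte NOP followed by a 1-byte NOP
	 * so the usual 5-byte ftrace call site layout is preserved.
	 */
	static unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

	static void make_initial_nop(unsigned char *ip)
	{
		__builtin_memcpy(ip, nop5, sizeof(nop5));
		ip[5] = 0x90;		/* trailing 1-byte NOP */
	}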
[PATCH v5 20/27] x86: Support global stack cookie
Add an off-by-default configuration option to use a global stack cookie instead of the default TLS. This configuration option will only be used with PIE binaries. For kernel stack cookie, the compiler uses the mcmodel=kernel to switch between the fs segment to gs segment. A PIE binary does not use mcmodel=kernel because it can be relocated anywhere, therefore the compiler will default to the fs segment register. This is fixed on the latest version of gcc. If the segment selector is available, it will be automatically added. If the automatic configuration was selected, a warning is written and the global variable stack cookie is used. If a specific stack mode was selected (regular or strong) and the compiler does not support selecting the segment register, an error is emitted. Signed-off-by: Thomas Garnier --- arch/x86/Kconfig | 12 arch/x86/Makefile | 9 + arch/x86/entry/entry_32.S | 3 ++- arch/x86/entry/entry_64.S | 3 ++- arch/x86/include/asm/processor.h | 3 ++- arch/x86/include/asm/stackprotector.h | 19 ++- arch/x86/kernel/asm-offsets.c | 3 ++- arch/x86/kernel/asm-offsets_32.c | 3 ++- arch/x86/kernel/asm-offsets_64.c | 3 ++- arch/x86/kernel/cpu/common.c | 3 ++- arch/x86/kernel/head_32.S | 3 ++- arch/x86/kernel/process.c | 5 + 12 files changed, 56 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c4d64b19acff..f49725df7109 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2212,6 +2212,18 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. +config X86_GLOBAL_STACKPROTECTOR + bool "Stack cookie using a global variable" + depends on CC_STACKPROTECTOR_AUTO + default n + ---help--- + This option turns on the "stack-protector" GCC feature using a global + variable instead of a segment register. It is useful when the + compiler does not support custom segment registers when building a + position independent (PIE) binary. 
+ + If unsure, say N + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/Makefile b/arch/x86/Makefile index a08e82856563..c2c221fd20d7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -141,6 +141,15 @@ else KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) endif +ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR +ifeq ($(call cc-option, -mstack-protector-guard=global),) +$(error Cannot use CONFIG_X86_GLOBAL_STACKPROTECTOR: \ +-mstack-protector-guard=global not supported \ +by compiler) +endif +KBUILD_CFLAGS += -mstack-protector-guard=global +endif + ifdef CONFIG_X86_X32 x32_ld_ok := $(call try-run,\ /bin/echo -e '1: .quad 1b' | \ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 2582881d19ce..4298307c4275 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -239,7 +239,8 @@ ENTRY(__switch_to_asm) movl%esp, TASK_threadsp(%eax) movlTASK_threadsp(%edx), %esp -#ifdef CONFIG_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) movlTASK_stack_canary(%edx), %ebx movl%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2afd2e2a86db..a603a0505706 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -357,7 +357,8 @@ ENTRY(__switch_to_asm) movq%rsp, TASK_threadsp(%rdi) movqTASK_threadsp(%rsi), %rsp -#ifdef CONFIG_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) movqTASK_stack_canary(%rsi), %rbx movq%rbx, PER_CPU_VAR(irq_stack_union + stack_canary_offset) #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6ee253d279d9..a1979e15621f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -414,7 +414,8 @@ extern asmlinkage void ignore_sysret(void); void save_fsgs_for_kvm(void); #endif #else /* X86_64 */ -#ifdef CONFIG_STACKPROTECTOR +#if defined(CONFIG_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) /* * Make sure stack canary segment base is cached-aligned: * "For Intel Atom processors, avoid non zero segment base address diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 8ec97a62c245..4e120cf36782 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackpr
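For reference, a simplified sketch of what the two stack-protector flavours
look like from the compiler's point of view (illustration only; the real
canary setup lives elsewhere in the kernel):

	/*
	 * Default (per-cpu, segment-based) canary load:
	 *	movq	%gs:0x28, %rax
	 *
	 * With -mstack-protector-guard=global
	 * (CONFIG_X86_GLOBAL_STACKPROTECTOR):
	 *	movq	__stack_chk_guard(%rip), %rax
	 */
	unsigned long __stack_chk_guard;	/* single kernel-wide cookie */

	void __stack_chk_fail(void);		/* called on a canary mismatch */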
[PATCH v5 16/27] compiler: Option to add PROVIDE_HIDDEN replacement for weak symbols
Provide an option to have a PROVIDE_HIDDEN (linker script) entry for each
weak symbol. This option solves an error on x86_64 where the linker
optimizes PIE-generated code into non-PIE code because --emit-relocs is
used instead of -pie (to reduce dynamic relocations).

Signed-off-by: Thomas Garnier
---
 init/Kconfig            |  7 +++++++
 scripts/link-vmlinux.sh | 14 ++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/init/Kconfig b/init/Kconfig
index d4f90cc38ede..2d7431a8b108 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1974,6 +1974,13 @@ config ASN1
 	  inform it as to what tags are to be expected in a stream and what
 	  functions to call on what tags.
 
+config WEAK_PROVIDE_HIDDEN
+	bool
+	help
+	  Generate linker script PROVIDE_HIDDEN entries for all weak symbols. It
+	  allows to prevent non-PIE code being replaced by the linker if the
+	  emit-relocs option is used instead of PIE (useful for x86_64 PIE).
+
 source "kernel/Kconfig.locks"
 
 config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index 4bf811c09f59..f5d31119b9d7 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -142,6 +142,17 @@ kallsyms()
 	${CC} ${aflags} -c -o ${2} ${afile}
 }
 
+gen_weak_provide_hidden()
+{
+	if [ -n "${CONFIG_WEAK_PROVIDE_HIDDEN}" ]; then
+		local pattern="s/^\s\+ w \(\w\+\)$/PROVIDE_HIDDEN(\1 = .);/gp"
+		echo -e "SECTIONS {\n. = _end;" > .tmp_vmlinux_hiddenld
+		${NM} ${1} | sed -n "${pattern}" >> .tmp_vmlinux_hiddenld
+		echo "}" >> .tmp_vmlinux_hiddenld
+		LDFLAGS_vmlinux="${LDFLAGS_vmlinux} -T .tmp_vmlinux_hiddenld"
+	fi
+}
+
 # Create map file with all symbols from ${1}
 # See mksymap for additional details
 mksysmap()
@@ -226,6 +237,9 @@ modpost_link vmlinux.o
 # modpost vmlinux.o to check for section mismatches
 ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o
 
+# Generate weak linker script
+gen_weak_provide_hidden vmlinux.o
+
 kallsymso=""
 kallsyms_vmlinux=""
 if [ -n "${CONFIG_KALLSYMS}" ]; then
-- 
2.18.0.rc2.346.g013aa6912e-goog
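For example, for a vmlinux.o containing a single weak symbol named foo (a
made-up name), gen_weak_provide_hidden() above would generate a
.tmp_vmlinux_hiddenld along these lines:

	SECTIONS {
	. = _end;
	PROVIDE_HIDDEN(foo = .);
	}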
[PATCH v5 19/27] kvm: Adapt assembly for PIE support
Change the assembly code to use only relative references to symbols so the
kernel can be PIE compatible. The new __ASM_MOVABS macro is used to get the
address of a symbol on both 32-bit and 64-bit with PIE support.

Position Independent Executable (PIE) support will allow extending the
KASLR randomization range below 0xffffffff80000000.

Signed-off-by: Thomas Garnier
---
 arch/x86/include/asm/kvm_host.h | 8 ++++++--
 arch/x86/kernel/kvm.c           | 6 ++++--
 arch/x86/kvm/svm.c              | 4 ++--
 3 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c13cd28d9d1b..27370f4917a4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1390,9 +1390,13 @@ asmlinkage void kvm_spurious_fault(void);
 	".pushsection .fixup, \"ax\" \n" \
 	"667: \n\t" \
 	cleanup_insn "\n\t" \
-	"cmpb $0, kvm_rebooting \n\t" \
+	"cmpb $0, kvm_rebooting" __ASM_SEL(,(%%rip)) " \n\t" \
 	"jne 668b \n\t" \
-	__ASM_SIZE(push) " $666b \n\t" \
+	__ASM_SIZE(push) "$0 \n\t" \
+	__ASM_SIZE(push) "%%" _ASM_AX " \n\t" \
+	_ASM_MOVABS " $666b, %%" _ASM_AX "\n\t" \
+	_ASM_MOV " %%" _ASM_AX ", " __ASM_SEL(4,8) "(%%" _ASM_SP ") \n\t" \
+	__ASM_SIZE(pop) "%%" _ASM_AX " \n\t" \
 	"call kvm_spurious_fault \n\t" \
 	".popsection \n\t" \
 	_ASM_EXTABLE(666b, 667b)
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index a37bda38d205..761157c138c9 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -726,8 +726,10 @@ asm(
 ".global __raw_callee_save___kvm_vcpu_is_preempted;"
 ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
 "__raw_callee_save___kvm_vcpu_is_preempted:"
-"movq	__per_cpu_offset(,%rdi,8), %rax;"
-"cmpb	$0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
+"leaq	__per_cpu_offset(%rip), %rax;"
+"movq	(%rax,%rdi,8), %rax;"
+"addq	" __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rip), %rax;"
+"cmpb	$0, (%rax);"
 "setne	%al;"
 "ret;"
 ".popsection");
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index f059a73f0fd0..cb78647d6383 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -707,12 +707,12 @@ static u32 svm_msrpm_offset(u32 msr)
 
 static inline void clgi(void)
 {
-	asm volatile (__ex(SVM_CLGI));
+	asm volatile (__ex(SVM_CLGI) : :);
 }
 
 static inline void stgi(void)
 {
-	asm volatile (__ex(SVM_STGI));
+	asm volatile (__ex(SVM_STGI) : :);
 }
 
 static inline void invlpga(unsigned long addr, u32 asid)
-- 
2.18.0.rc2.346.g013aa6912e-goog
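The pattern used throughout this patch is the same absolute-to-RIP-relative
conversion; a minimal standalone sketch (illustration only, using a made-up
example_flag symbol rather than the real kvm_rebooting):

	int example_flag;

	static inline int read_flag_absolute(void)
	{
		int v;

		/* mcmodel=kernel style: absolute reference, not PIE compatible */
		asm ("movl example_flag, %0" : "=r" (v));
		return v;
	}

	static inline int read_flag_pie(void)
	{
		int v;

		/* PIE style: RIP-relative reference, valid at any load address */
		asm ("movl example_flag(%%rip), %0" : "=r" (v));
		return v;
	}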
[PATCH v5 15/27] compiler: Option to default to hidden symbols
Provide an option to default visibility to hidden except for key symbols. This option is disabled by default and will be used by x86_64 PIE support to remove errors between compilation units. The default visibility is also enabled for external symbols that are compared as they maybe equals (start/end of sections). In this case, older versions of GCC will remove the comparison if the symbols are hidden. This issue exists at least on gcc 4.9 and before. Signed-off-by: Thomas Garnier --- arch/x86/boot/boot.h | 2 +- arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/cpu/microcode/core.c | 4 ++-- drivers/base/firmware_loader/main.c | 4 ++-- include/asm-generic/sections.h | 6 ++ include/linux/compiler.h | 7 +++ init/Kconfig | 7 +++ kernel/kallsyms.c| 16 kernel/trace/trace.h | 4 ++-- lib/dynamic_debug.c | 4 ++-- 10 files changed, 38 insertions(+), 18 deletions(-) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index ef5a9cc66fb8..d726c35bdd96 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -193,7 +193,7 @@ static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) } /* Heap -- available for dynamic lists. */ -extern char _end[]; +extern char _end[] __default_visibility; extern char *HEAP; extern char *heap_end; #define RESET_HEAP() ((void *)( HEAP = _end )) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ae13bc974416..083a6e99b884 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -68,7 +68,7 @@ static inline void x86_ce4100_early_setup(void) { } * This is set up by the setup-routine at boot-time */ extern struct boot_params boot_params; -extern char _text[]; +extern char _text[] __default_visibility; static inline bool kaslr_enabled(void) { diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 08286269fd24..3ed2f9d54abb 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -149,8 +149,8 @@ static bool __init check_loader_disabled_bsp(void) return *res; } -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; +extern struct builtin_fw __start_builtin_fw[] __default_visibility; +extern struct builtin_fw __end_builtin_fw[] __default_visibility; bool get_builtin_firmware(struct cpio_data *cd, const char *name) { diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 0943e7065e0e..2ffd019af2d4 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -94,8 +94,8 @@ static struct firmware_cache fw_cache; #ifdef CONFIG_FW_LOADER -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; +extern struct builtin_fw __start_builtin_fw[] __default_visibility; +extern struct builtin_fw __end_builtin_fw[] __default_visibility; static void fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 849cd8eb5ca0..0a0e23405ddd 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -32,6 +32,9 @@ * __softirqentry_text_start, __softirqentry_text_end * __start_opd, __end_opd */ +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility push(default) +#endif extern char _text[], _stext[], _etext[]; extern char _data[], _sdata[], _edata[]; extern char __bss_start[], __bss_stop[]; @@ -49,6 +52,9 @@ extern char __start_once[], __end_once[]; /* Start and end of .ctors section - used for 
constructor calls. */ extern char __ctors_start[], __ctors_end[]; +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility pop +#endif /* Start and end of .opd section - used for function descriptors. */ extern char __start_opd[], __end_opd[]; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 42506e4d1f53..d9837a58906e 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -278,6 +278,13 @@ unsigned long read_word_at_a_time(const void *addr) __u.__val; \ }) +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility push(hidden) +#define __default_visibility __attribute__((visibility ("default"))) +#else +#define __default_visibility +#endif + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/init/Kconfig b/init/Kconfig index 24b60536e26b..d4f90cc38ede 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1718,6 +1718,13 @@ config PROFILING config TRACEPOINTS bool +# +# Default to hidden visibility for all symbols. +# Useful for Position Independent Code to reduce global references. +# +config DEFAULT_HIDDEN + bool + source &qu
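To make the visibility mechanism above concrete, here is a minimal standalone sketch (plain userspace C, made-up file and symbol names, not the kernel's actual headers) of how hidden-by-default visibility combines with a __default_visibility escape hatch, and which references can then skip the GOT under -fPIE:

/* visibility-demo.c -- illustrative only.
 * Build: gcc -O2 -fPIE -S visibility-demo.c
 * and compare how each symbol's address is formed in the generated assembly.
 */

/* Everything below defaults to hidden visibility, mirroring what
 * CONFIG_DEFAULT_HIDDEN does for whole compilation units. */
#pragma GCC visibility push(hidden)

#define __default_visibility __attribute__((visibility("default")))

extern char _text[] __default_visibility; /* kept default, e.g. a section boundary */
extern char other_unit_var[];             /* hidden like everything else */

unsigned long demo(void)
{
        /* The hidden symbol can be reached with a plain RIP-relative
         * reference; the default-visibility one may still go through
         * the GOT when building with -fPIE. */
        return (unsigned long)_text ^ (unsigned long)other_unit_var;
}

#pragma GCC visibility pop

Keeping the section boundary symbols at default visibility also sidesteps the old GCC behavior mentioned above, where comparisons between hidden start/end symbols could be optimized away.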
[PATCH v5 13/27] x86/boot/64: Build head64.c as mcmodel large when PIE is enabled
The __startup_64 function assumes all symbols have relocated addresses instead of the current boot virtual address. PIE generated code favor relative addresses making all virtual and physical address math incorrect. If PIE is enabled, build head64.c as mcmodel large instead to ensure absolute references on all memory access. Add a global __force_order variable required when using a large model with read_cr* functions. To build head64.c as mcmodel=large, disable the retpoline gcc flags. This code is used at early boot and removed later, it doesn't need retpoline mitigation. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/head64.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02d6f5cf4e70..0f6da4b216e0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,6 +22,12 @@ CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg endif +ifdef CONFIG_X86_PIE +# Remove PIE and retpoline flags that are incompatible with mcmodel=large +CFLAGS_REMOVE_head64.o += -fPIE -mindirect-branch=thunk-extern -mindirect-branch-register +CFLAGS_head64.o = -mcmodel=large +endif + KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 8047379e575a..49df0386098c 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -62,6 +62,9 @@ EXPORT_SYMBOL(vmemmap_base); #define __head __section(.head.text) +/* Required for read_cr3 when building as PIE */ +unsigned long __force_order; + static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; -- 2.18.0.rc2.346.g013aa6912e-goog
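For context on the __force_order addition: the cr accessors use a dummy memory operand on that global purely to keep the compiler from reordering the asm statements. A rough userspace sketch of the pattern follows (made-up function name, not the kernel's special_insns.h; the exact code generation depends on the compiler):

/* force-order-demo.c -- illustrative sketch only.
 * Build (compile only, do not run): gcc -O2 -mcmodel=large -c force-order-demo.c
 */

/* Dummy global: its only job is to give the asm statements a common
 * (fake) memory dependency so the compiler keeps their order. */
unsigned long __force_order;

static inline unsigned long fake_read_cr3(void)
{
        unsigned long val;

        /* The "=m"(__force_order) operand ties this asm to the dummy
         * global.  Under -mcmodel=large the compiler ends up emitting a
         * real (64-bit absolute) reference to the symbol for this operand,
         * which is why the patch has to provide a definition for files
         * built that way. */
        asm volatile("movq %%cr3, %0" : "=r" (val), "=m" (__force_order));
        return val;
}

unsigned long probe_cr3(void)
{
        return fake_read_cr3();
}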
[PATCH v5 06/27] x86/entry/64: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/entry/entry_64.S| 18 -- arch/x86/kernel/relocate_kernel_64.S | 8 +++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c6f3677e6105..e738d8d0e308 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,7 +191,7 @@ ENTRY(entry_SYSCALL_64_trampoline) * spill RDI and restore it in a second-stage trampoline. */ pushq %rdi - movq$entry_SYSCALL_64_stage2, %rdi + movabsq $entry_SYSCALL_64_stage2, %rdi JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) @@ -1277,7 +1277,8 @@ ENTRY(error_entry) movl%ecx, %eax /* zero extend */ cmpq%rax, RIP+8(%rsp) je .Lbstep_iret - cmpq$.Lgs_change, RIP+8(%rsp) + leaq.Lgs_change(%rip), %rcx + cmpq%rcx, RIP+8(%rsp) jne .Lerror_entry_done /* @@ -1482,10 +1483,10 @@ ENTRY(nmi) * resume the outer NMI. */ - movq$repeat_nmi, %rdx + leaqrepeat_nmi(%rip), %rdx cmpq8(%rsp), %rdx ja 1f - movq$end_repeat_nmi, %rdx + leaqend_repeat_nmi(%rip), %rdx cmpq8(%rsp), %rdx ja nested_nmi_out 1: @@ -1539,7 +1540,8 @@ nested_nmi: pushq %rdx pushfq pushq $__KERNEL_CS - pushq $repeat_nmi + leaqrepeat_nmi(%rip), %rdx + pushq %rdx /* Put stack back */ addq$(6*8), %rsp @@ -1578,7 +1580,11 @@ first_nmi: addq$8, (%rsp) /* Fix up RSP */ pushfq /* RFLAGS */ pushq $__KERNEL_CS/* CS */ - pushq $1f /* RIP */ + pushq $0 /* Futur return address */ + pushq %rax/* Save RAX */ + leaq1f(%rip), %rax /* RIP */ + movq%rax, 8(%rsp) /* Put 1f on return address */ + popq%rax/* Restore RAX */ iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a7227dfe1a2b..0c0fc259a4e2 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -208,11 +208,9 @@ identity_mapped: movq%rax, %cr3 lea PAGE_SIZE(%r8), %rsp callswap_pages - jmp *virtual_mapped_addr(%rip) - - /* Absolute value for PIE support */ -virtual_mapped_addr: - .quad virtual_mapped + movabsq $virtual_mapped, %rax + pushq %rax + ret virtual_mapped: movqRSP(%r8), %rsp -- 2.18.0.rc2.346.g013aa6912e-goog
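The three addressing forms this patch juggles can be compared in isolation. The sketch below is standalone userspace C with inline asm (made-up symbol name), not kernel code; building it and checking the relocations shows why each rewrite was picked:

/* pie-addressing-demo.c -- illustrative only.
 * Build: gcc -O2 -c pie-addressing-demo.c && objdump -dr pie-addressing-demo.o
 */

extern char some_symbol[];

unsigned long via_mov_immediate(void)
{
        unsigned long addr;
        /* movq $sym, %reg: 32-bit sign-extended absolute relocation
         * (R_X86_64_32S).  Fine for -mcmodel=kernel, rejected in a PIE link. */
        asm ("movq $some_symbol, %0" : "=r" (addr));
        return addr;
}

unsigned long via_lea_rip(void)
{
        unsigned long addr;
        /* leaq sym(%rip), %reg: PC-relative (R_X86_64_PC32).  The form the
         * series prefers whenever a relative reference is good enough. */
        asm ("leaq some_symbol(%%rip), %0" : "=r" (addr));
        return addr;
}

unsigned long via_movabs(void)
{
        unsigned long addr;
        /* movabsq $sym, %reg: full 64-bit absolute relocation (R_X86_64_64).
         * Used where a relative reference would resolve to the wrong mapping,
         * e.g. the syscall trampoline above, which runs from a different
         * virtual address than the one it was linked at. */
        asm ("movabsq $some_symbol, %0" : "=r" (addr));
        return addr;
}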
[PATCH v5 12/27] x86/paravirt: Adapt assembly for PIE support
if PIE is enabled, switch the paravirt assembly constraints to be compatible. The %c/i constrains generate smaller code so is kept by default. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/paravirt_types.h | 12 ++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 180bc0bff0fb..140747a98d94 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -337,9 +337,17 @@ extern struct pv_lock_ops pv_lock_ops; #define PARAVIRT_PATCH(x) \ (offsetof(struct paravirt_patch_template, x) / sizeof(void *)) +#ifdef CONFIG_X86_PIE +#define paravirt_opptr_call "a" +#define paravirt_opptr_type "p" +#else +#define paravirt_opptr_call "c" +#define paravirt_opptr_type "i" +#endif + #define paravirt_type(op) \ [paravirt_typenum] "i" (PARAVIRT_PATCH(op)),\ - [paravirt_opptr] "i" (&(op)) + [paravirt_opptr] paravirt_opptr_type (&(op)) #define paravirt_clobber(clobber) \ [paravirt_clobber] "i" (clobber) @@ -395,7 +403,7 @@ int paravirt_disable_iospace(void); */ #define PARAVIRT_CALL \ ANNOTATE_RETPOLINE_SAFE \ - "call *%c[paravirt_opptr];" + "call *%" paravirt_opptr_call "[paravirt_opptr];" /* * These macros are intended to wrap calls through one of the paravirt -- 2.18.0.rc2.346.g013aa6912e-goog
[PATCH v5 10/27] x86/boot/64: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Early at boot, the kernel is mapped at a temporary address while preparing the page table. To know the changes needed for the page table with KASLR, the boot code calculate the difference between the expected address of the kernel and the one chosen by KASLR. It does not work with PIE because all symbols in code are relatives. Instead of getting the future relocated virtual address, you will get the current temporary mapping. Instructions were changed to have absolute 64-bit references. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/head_64.S | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 8344dd2f310a..7fca19e1f556 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -89,8 +89,10 @@ startup_64: popq%rsi /* Form the CR3 value being sure to include the CR3 modifier */ - addq$(early_top_pgt - __START_KERNEL_map), %rax + movabs $(early_top_pgt - __START_KERNEL_map), %rcx + addq%rcx, %rax jmp 1f + ENTRY(secondary_startup_64) UNWIND_HINT_EMPTY /* @@ -119,7 +121,8 @@ ENTRY(secondary_startup_64) popq%rsi /* Form the CR3 value being sure to include the CR3 modifier */ - addq$(init_top_pgt - __START_KERNEL_map), %rax + movabs $(init_top_pgt - __START_KERNEL_map), %rcx + addq%rcx, %rax 1: /* Enable PAE mode, PGE and LA57 */ @@ -137,7 +140,7 @@ ENTRY(secondary_startup_64) movq%rax, %cr3 /* Ensure I am executing from virtual addresses */ - movq$1f, %rax + movabs $1f, %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: @@ -234,11 +237,12 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - pushq $.Lafter_lret # put return address on stack for unwinder + movabs $.Lafter_lret, %rax + pushq %rax# put return address on stack for unwinder xorq%rbp, %rbp # clear frame pointer - movqinitial_code(%rip), %rax + leaqinitial_code(%rip), %rax pushq $__KERNEL_CS# set correct cs - pushq %rax# target address in negative space + pushq (%rax) # target address in negative space lretq .Lafter_lret: END(secondary_startup_64) -- 2.18.0.rc2.346.g013aa6912e-goog
[PATCH v5 09/27] x86/acpi: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier Acked-by: Pavel Machek Acked-by: Rafael J. Wysocki --- arch/x86/kernel/acpi/wakeup_64.S | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 50b8ed0317a3..472659c0f811 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,7 +14,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ ENTRY(wakeup_long64) - movqsaved_magic, %rax + movqsaved_magic(%rip), %rax movq$0x123456789abcdef0, %rdx cmpq%rdx, %rax jne bogus_64_magic @@ -25,14 +25,14 @@ ENTRY(wakeup_long64) movw%ax, %es movw%ax, %fs movw%ax, %gs - movqsaved_rsp, %rsp + movqsaved_rsp(%rip), %rsp - movqsaved_rbx, %rbx - movqsaved_rdi, %rdi - movqsaved_rsi, %rsi - movqsaved_rbp, %rbp + movqsaved_rbx(%rip), %rbx + movqsaved_rdi(%rip), %rdi + movqsaved_rsi(%rip), %rsi + movqsaved_rbp(%rip), %rbp - movqsaved_rip, %rax + movqsaved_rip(%rip), %rax jmp *%rax ENDPROC(wakeup_long64) @@ -45,7 +45,7 @@ ENTRY(do_suspend_lowlevel) xorl%eax, %eax callsave_processor_state - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movq%rsp, pt_regs_sp(%rax) movq%rbp, pt_regs_bp(%rax) movq%rsi, pt_regs_si(%rax) @@ -64,13 +64,14 @@ ENTRY(do_suspend_lowlevel) pushfq popqpt_regs_flags(%rax) - movq$.Lresume_point, saved_rip(%rip) + leaq.Lresume_point(%rip), %rax + movq%rax, saved_rip(%rip) - movq%rsp, saved_rsp - movq%rbp, saved_rbp - movq%rbx, saved_rbx - movq%rdi, saved_rdi - movq%rsi, saved_rsi + movq%rsp, saved_rsp(%rip) + movq%rbp, saved_rbp(%rip) + movq%rbx, saved_rbx(%rip) + movq%rdi, saved_rdi(%rip) + movq%rsi, saved_rsi(%rip) addq$8, %rsp movl$3, %edi @@ -82,7 +83,7 @@ ENTRY(do_suspend_lowlevel) .align 4 .Lresume_point: /* We don't restore %rax, it must be 0 anyway */ - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movqsaved_context_cr4(%rax), %rbx movq%rbx, %cr4 movqsaved_context_cr3(%rax), %rbx -- 2.18.0.rc2.346.g013aa6912e-goog
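The wakeup changes are the same idea applied to loads and stores of globals rather than to address materialization. A small standalone sketch (userspace C with inline asm, made-up variable name) of the two encodings the patch switches between:

/* global-load-demo.c -- illustrative only.
 * Build: gcc -O2 -c global-load-demo.c && objdump -dr global-load-demo.o
 */

unsigned long saved_value;

unsigned long load_absolute(void)
{
        unsigned long v;
        /* movq sym, %reg: the displacement is an absolute 32-bit relocation
         * (R_X86_64_32S), which a PIE link rejects. */
        asm ("movq saved_value, %0" : "=r" (v));
        return v;
}

unsigned long load_rip_relative(void)
{
        unsigned long v;
        /* movq sym(%rip), %reg: PC-relative displacement (R_X86_64_PC32),
         * position independent, which is all the wakeup path needs since it
         * only touches its own save slots. */
        asm ("movq saved_value(%%rip), %0" : "=r" (v));
        return v;
}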
[PATCH v5 04/27] x86: Add macro to get symbol address for PIE support
Add a new _ASM_MOVABS macro to fetch a symbol address. It will be used to replace "_ASM_MOV $, %dst" code constructs that are not compatible with PIE. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/asm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..4492a35fad69 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -30,6 +30,7 @@ #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) #define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_MOVABS __ASM_SEL(movl, movabsq) #define _ASM_INC __ASM_SIZE(inc) #define _ASM_DEC __ASM_SIZE(dec) #define _ASM_ADD __ASM_SIZE(add) -- 2.18.0.rc2.346.g013aa6912e-goog
Re: [PATCH v4 21/27] x86/ftrace: Adapt function tracing for PIE support
On Tue, Jun 5, 2018 at 9:56 AM Thomas Garnier wrote: > > On Mon, Jun 4, 2018 at 2:44 PM Steven Rostedt wrote: > > > > On Mon, 4 Jun 2018 14:06:03 -0700 > > Thomas Garnier wrote: > > > > > On Mon, Jun 4, 2018 at 1:16 PM Steven Rostedt wrote: > > > > > > > > On Tue, 29 May 2018 15:15:22 -0700 > > > > Thomas Garnier wrote: > > > > > > > > > When using -fPIE/PIC with function tracing, the compiler generates a > > > > > call through the GOT (call *__fentry__@GOTPCREL). This instruction > > > > > takes 6 bytes instead of 5 on the usual relative call. > > > > > > > > > > If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte > > > > > nop > > > > > so ftrace can handle the previous 5-bytes as before. > > > > > > > > > > Position Independent Executable (PIE) support will allow to extend the > > > > > KASLR randomization range 0x8000. > > > > > > > > I thought you were going to write a update to recordmcount.c to handle > > > > this at compile time? > > > > > > I can correctly calculate the start of the call instruction with > > > recordmcount (no need for addr-1) but I still need to handle the > > > different size of the instructions. I don't think I can completely > > > replace the GOT call with a relative call. Maybe I am missing > > > something on the way recordmcount is used? Should it replace all > > > mcount locations with a nop slide? Why is it done at runtime too then? > > > > Because we need to figure out the "ideal nop" thus we need to change it > > regardless. > > I see what you mean looking at the different ideal_nops based on > configurations. > > > > > We could have recordmcount.c replace everything with the default nop > > (I've thought of that before), and then we could update with the ideal > > nop at run time, if that helps with this. > > I don't think that's necessary. In proposed implementation of PIE, > kernel modules would not use a GOT call. In the current implementation > the __fentry__ call is always GOT based (6-bytes). I will simplify the > runtime implementation in the next patch set to just swap the expected > size and ideal_nop when PIE is enabled. Actually moving the logic from 5-bytes to 6-bytes is much more complicated, that's why I went with this approach before. I don't think it can be improved much more beyond creating a nop slide in mrecordcount but that's a different approach. I will clean-up the code a bit for the next iteration but that's about it. Let me know what you think. > > > > > -- Steve > > > > -- > Thomas -- Thomas
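For readers skimming the thread, the byte-level situation being debated looks like this. The sketch below is standalone C that just lays out the instruction bytes involved; it is a reading aid for the discussion above, not the kernel's ftrace patching code, and the exact nop chosen at runtime (ideal_nops) varies by CPU:

/* fentry-bytes-demo.c -- conceptual sketch only.
 * Build and run: gcc -O2 -o fentry-bytes-demo fentry-bytes-demo.c && ./fentry-bytes-demo
 */
#include <stdio.h>
#include <string.h>

int main(void)
{
        /* What -fPIE emits at function entry: a 6-byte indirect call,
         * call *__fentry__@GOTPCREL(%rip)  ->  ff 15 <disp32>. */
        unsigned char got_call[6] = { 0xff, 0x15, 0x00, 0x00, 0x00, 0x00 };

        /* What ftrace's 5-byte patching expects: either a near call,
         * e8 <rel32>, or a 5-byte nop such as 0f 1f 44 00 00. */
        unsigned char nop5[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 };

        /* The approach described above: treat the first 5 bytes as the
         * usual patch site and turn the trailing 6th byte into a
         * standalone 1-byte nop (0x90), so the existing 5-byte logic
         * keeps working on PIE kernels. */
        unsigned char patched[6];
        memcpy(patched, nop5, sizeof(nop5));
        patched[5] = 0x90;

        printf("GOT call: %02x %02x ... (%zu bytes); patched: 5-byte slot + nop 0x%02x\n",
               got_call[0], got_call[1], sizeof(got_call), patched[5]);
        return 0;
}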
Re: [PATCH v4 21/27] x86/ftrace: Adapt function tracing for PIE support
On Mon, Jun 4, 2018 at 2:44 PM Steven Rostedt wrote: > > On Mon, 4 Jun 2018 14:06:03 -0700 > Thomas Garnier wrote: > > > On Mon, Jun 4, 2018 at 1:16 PM Steven Rostedt wrote: > > > > > > On Tue, 29 May 2018 15:15:22 -0700 > > > Thomas Garnier wrote: > > > > > > > When using -fPIE/PIC with function tracing, the compiler generates a > > > > call through the GOT (call *__fentry__@GOTPCREL). This instruction > > > > takes 6 bytes instead of 5 on the usual relative call. > > > > > > > > If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte nop > > > > so ftrace can handle the previous 5-bytes as before. > > > > > > > > Position Independent Executable (PIE) support will allow to extend the > > > > KASLR randomization range 0x8000. > > > > > > I thought you were going to write a update to recordmcount.c to handle > > > this at compile time? > > > > I can correctly calculate the start of the call instruction with > > recordmcount (no need for addr-1) but I still need to handle the > > different size of the instructions. I don't think I can completely > > replace the GOT call with a relative call. Maybe I am missing > > something on the way recordmcount is used? Should it replace all > > mcount locations with a nop slide? Why is it done at runtime too then? > > Because we need to figure out the "ideal nop" thus we need to change it > regardless. I see what you mean looking at the different ideal_nops based on configurations. > > We could have recordmcount.c replace everything with the default nop > (I've thought of that before), and then we could update with the ideal > nop at run time, if that helps with this. I don't think that's necessary. In proposed implementation of PIE, kernel modules would not use a GOT call. In the current implementation the __fentry__ call is always GOT based (6-bytes). I will simplify the runtime implementation in the next patch set to just swap the expected size and ideal_nop when PIE is enabled. > > -- Steve -- Thomas
Re: [PATCH v4 21/27] x86/ftrace: Adapt function tracing for PIE support
On Mon, Jun 4, 2018 at 1:16 PM Steven Rostedt wrote: > > On Tue, 29 May 2018 15:15:22 -0700 > Thomas Garnier wrote: > > > When using -fPIE/PIC with function tracing, the compiler generates a > > call through the GOT (call *__fentry__@GOTPCREL). This instruction > > takes 6 bytes instead of 5 on the usual relative call. > > > > If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte nop > > so ftrace can handle the previous 5-bytes as before. > > > > Position Independent Executable (PIE) support will allow to extend the > > KASLR randomization range 0x8000. > > I thought you were going to write a update to recordmcount.c to handle > this at compile time? I can correctly calculate the start of the call instruction with recordmcount (no need for addr-1) but I still need to handle the different size of the instructions. I don't think I can completely replace the GOT call with a relative call. Maybe I am missing something on the way recordmcount is used? Should it replace all mcount locations with a nop slide? Why is it done at runtime too then? > > -- Steve > > > > > Signed-off-by: Thomas Garnier > > --- > > -- Thomas
[PATCH v4 04/27] x86: Add macro to get symbol address for PIE support
Add a new _ASM_MOVABS macro to fetch a symbol address. It will be used to replace "_ASM_MOV $, %dst" code constructs that are not compatible with PIE. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/asm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 219faaec51df..4492a35fad69 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -30,6 +30,7 @@ #define _ASM_ALIGN __ASM_SEL(.balign 4, .balign 8) #define _ASM_MOV __ASM_SIZE(mov) +#define _ASM_MOVABS __ASM_SEL(movl, movabsq) #define _ASM_INC __ASM_SIZE(inc) #define _ASM_DEC __ASM_SIZE(dec) #define _ASM_ADD __ASM_SIZE(add) -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 01/27] x86/crypto: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/crypto/aes-x86_64-asm_64.S | 45 + arch/x86/crypto/aesni-intel_asm.S| 8 +- arch/x86/crypto/aesni-intel_avx-x86_64.S | 6 +- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 42 - arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 44 - arch/x86/crypto/camellia-x86_64-asm_64.S | 8 +- arch/x86/crypto/cast5-avx-x86_64-asm_64.S| 50 +- arch/x86/crypto/cast6-avx-x86_64-asm_64.S| 44 + arch/x86/crypto/des3_ede-asm_64.S| 96 +--- arch/x86/crypto/ghash-clmulni-intel_asm.S| 4 +- arch/x86/crypto/glue_helper-asm-avx.S| 4 +- arch/x86/crypto/glue_helper-asm-avx2.S | 6 +- arch/x86/crypto/sha256-avx2-asm.S| 23 +++-- 13 files changed, 221 insertions(+), 159 deletions(-) diff --git a/arch/x86/crypto/aes-x86_64-asm_64.S b/arch/x86/crypto/aes-x86_64-asm_64.S index 8739cf7795de..86fa068e5e81 100644 --- a/arch/x86/crypto/aes-x86_64-asm_64.S +++ b/arch/x86/crypto/aes-x86_64-asm_64.S @@ -48,8 +48,12 @@ #define R10%r10 #define R11%r11 +/* Hold global for PIE suport */ +#define RBASE %r12 + #define prologue(FUNC,KEY,B128,B192,r1,r2,r5,r6,r7,r8,r9,r10,r11) \ ENTRY(FUNC);\ + pushq RBASE; \ movqr1,r2; \ leaqKEY+48(r8),r9; \ movqr10,r11;\ @@ -74,54 +78,63 @@ movlr6 ## E,4(r9); \ movlr7 ## E,8(r9); \ movlr8 ## E,12(r9); \ + popqRBASE; \ ret;\ ENDPROC(FUNC); +#define round_mov(tab_off, reg_i, reg_o) \ + leaqtab_off(%rip), RBASE; \ + movl(RBASE,reg_i,4), reg_o; + +#define round_xor(tab_off, reg_i, reg_o) \ + leaqtab_off(%rip), RBASE; \ + xorl(RBASE,reg_i,4), reg_o; + #define round(TAB,OFFSET,r1,r2,r3,r4,r5,r6,r7,r8,ra,rb,rc,rd) \ movzbl r2 ## H,r5 ## E;\ movzbl r2 ## L,r6 ## E;\ - movlTAB+1024(,r5,4),r5 ## E;\ + round_mov(TAB+1024, r5, r5 ## E)\ movwr4 ## X,r2 ## X;\ - movlTAB(,r6,4),r6 ## E; \ + round_mov(TAB, r6, r6 ## E) \ roll$16,r2 ## E;\ shrl$16,r4 ## E;\ movzbl r4 ## L,r7 ## E;\ movzbl r4 ## H,r4 ## E;\ xorlOFFSET(r8),ra ## E; \ xorlOFFSET+4(r8),rb ## E; \ - xorlTAB+3072(,r4,4),r5 ## E;\ - xorlTAB+2048(,r7,4),r6 ## E;\ + round_xor(TAB+3072, r4, r5 ## E)\ + round_xor(TAB+2048, r7, r6 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r4 ## E;\ - movlTAB+1024(,r4,4),r4 ## E;\ + round_mov(TAB+1024, r4, r4 ## E)\ movwr3 ## X,r1 ## X;\ roll$16,r1 ## E;\ shrl$16,r3 ## E;\ - xorlTAB(,r7,4),r5 ## E; \ + round_xor(TAB, r7, r5 ## E) \ movzbl r3 ## L,r7 ## E;\ movzbl r3 ## H,r3 ## E;\ - xorlTAB+3072(,r3,4),r4 ## E;\ - xorlTAB+2048(,r7,4),r5 ## E;\ + round_xor(TAB+3072, r3, r4 ## E)\ + round_xor(TAB+2048, r7, r5 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r3 ## E;\ shrl$16,r1 ## E;\ - xorlTAB+3072(,r3,4),r6 ## E;\ - movlTAB+2048(,r7,4),r3 ## E;\ + round_xor(TAB+3072, r3, r6 ## E)\ + round_mov(TAB+2048, r7, r3 ## E)\ movzbl r1 ## L,r7 ## E;\ movzbl r1 ## H,r1 ## E;\ - xorlTAB+1024(,r1,4),r6 ## E;\ - xorlTAB(,r7,4),r3 ## E; \ + round_xor(TAB+1024, r1, r6 ## E)\ + round_xor(TAB, r7, r3 ## E) \ movzbl r2 ## H,r1 ## E;\ movzbl r2 ## L,r7 ## E;\ shrl$16,r2 ## E;\ - xorlTAB+3072(,r1,4),r3 ## E;\ - xorlTAB+2048(,r7,4),r4 ## E;\ + round_xor(TAB+3072, r1, r3 ## E)\ + round_xor(TAB+2048, r7, r4 ## E)\ movzbl r2 ## H,r1 ## E;\ movzbl r2 ## L,r2 ## E;\ xorlOFFSET+8(r8),rc ## E; \ xorlOFFSET+12(r8),rd ## E; \ - xorlTAB+1024(,r1,4),r3 ## E;\ - xorlTAB(,r2,4),r4 ## E; + round_xor(TAB+1024, r1, r3 ## E)\ + round_xor(TAB, r2, r4 ## E) #define move_regs(r1,r2,r3,r4) \ movlr3 ## E,r1 ## E;\ diff --git 
a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index e762ef417562..4df029aa5fc1 100644 --- a/arch/x86/crypto/aesni
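The crypto rewrite boils down to one addressing change repeated many times: a scaled-index access with the table address as an absolute displacement becomes a RIP-relative lea into a scratch register (RBASE, i.e. %r12, which is why the prologue now saves it) followed by a register-based access. A standalone sketch of the two forms (userspace C with inline asm, made-up table name):

/* table-index-demo.c -- illustrative only.
 * Build: gcc -O2 -c table-index-demo.c && objdump -dr table-index-demo.o
 */

unsigned int table[256];

unsigned int lookup_absolute(unsigned long idx)
{
        unsigned int v;
        /* table(,%idx,4): the table address is an absolute 32-bit
         * displacement (R_X86_64_32S), the form a PIE link cannot use. */
        asm ("movl table(,%1,4), %0" : "=r" (v) : "r" (idx));
        return v;
}

unsigned int lookup_pie(unsigned long idx)
{
        unsigned int v;
        unsigned long base;
        /* PIE-friendly: materialize the table address RIP-relatively in a
         * scratch register, then index off that register.  One extra
         * instruction and one extra register per access, matching the
         * round_mov()/round_xor() helpers in the patch. */
        asm ("leaq table(%%rip), %1\n\t"
             "movl (%1,%2,4), %0"
             : "=r" (v), "=&r" (base)
             : "r" (idx));
        (void)base;     /* scratch only */
        return v;
}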
[PATCH v4 05/27] x86: relocate_kernel - Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/relocate_kernel_64.S | 8 +--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 11eda21eb697..a7227dfe1a2b 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -208,9 +208,11 @@ identity_mapped: movq%rax, %cr3 lea PAGE_SIZE(%r8), %rsp callswap_pages - movq$virtual_mapped, %rax - pushq %rax - ret + jmp *virtual_mapped_addr(%rip) + + /* Absolute value for PIE support */ +virtual_mapped_addr: + .quad virtual_mapped virtual_mapped: movqRSP(%r8), %rsp -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 03/27] x86: Use symbol name in jump table for PIE support
Replace the %c constraint with %P. The %c is incompatible with PIE because it implies an immediate value whereas %P reference a symbol. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/jump_label.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 8c0de4282659..dfdcdc39604a 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -37,9 +37,9 @@ static __always_inline bool arch_static_branch(struct static_key *key, bool bran ".byte " __stringify(STATIC_KEY_INIT_NOP) "\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + _ASM_PTR "1b, %l[l_yes], %P0 \n\t" ".popsection \n\t" - : : "i" (key), "i" (branch) : : l_yes); + : : "X" (&((char *)key)[branch]) : : l_yes); return false; l_yes: @@ -53,9 +53,9 @@ static __always_inline bool arch_static_branch_jump(struct static_key *key, bool "2:\n\t" ".pushsection __jump_table, \"aw\" \n\t" _ASM_ALIGN "\n\t" - _ASM_PTR "1b, %l[l_yes], %c0 + %c1 \n\t" + _ASM_PTR "1b, %l[l_yes], %P0 \n\t" ".popsection \n\t" - : : "i" (key), "i" (branch) : : l_yes); + : : "X" (&((char *)key)[branch]) : : l_yes); return false; l_yes: -- 2.17.0.921.gf22659ad46-goog
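The interesting part of this change is on the constraint side: instead of passing the key and the branch flag as two immediates and letting the assembler compute "%c0 + %c1" (absolute arithmetic that PIE cannot express), the address key + branch is formed in C and handed to the asm as a single symbolic operand printed with %P0. A hedged standalone sketch of that pattern follows; the section and key names are made up and it assumes GCC's asm goto, so treat it as an approximation rather than the kernel's jump_label.h. The bug table patch below applies the same %c to %P substitution.

/* jump-entry-demo.c -- illustrative sketch only.
 * Build: gcc -O2 -fPIE -c jump-entry-demo.c && objdump -r jump-entry-demo.o
 */

struct demo_key { int enabled; };

static struct demo_key my_key;

static inline __attribute__((always_inline))
int demo_branch(struct demo_key *key, int branch)
{
        asm goto("1: .byte 0x0f,0x1f,0x44,0x00,0x00\n\t"  /* 5-byte nop site */
                 ".pushsection __demo_table, \"aw\"\n\t"
                 ".balign 8\n\t"
                 /* Record: patch site, jump target, and key address with the
                  * branch flag folded into its low bit.  The address is
                  * computed in C below, so the asm only ever sees one
                  * relocatable symbol instead of "sym + immediate". */
                 ".quad 1b, %l[l_yes], %P0\n\t"
                 ".popsection\n\t"
                 : : "X" (&((char *)key)[branch]) : : l_yes);
        return 0;
l_yes:
        return 1;
}

int demo(void)
{
        return demo_branch(&my_key, 0);
}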
[PATCH v4 02/27] x86: Use symbol name on bug table for PIE support
Replace the %c constraint with %P. The %c is incompatible with PIE because it implies an immediate value whereas %P reference a symbol. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 6804d6642767..3d690a4abf50 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -35,7 +35,7 @@ do { \ asm volatile("1:\t" ins "\n"\ ".pushsection __bug_table,\"aw\"\n"\ "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ -"\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ +"\t" __BUG_REL(%P0) "\t# bug_entry::file\n" \ "\t.word %c1""\t# bug_entry::line\n" \ "\t.word %c2""\t# bug_entry::flags\n" \ "\t.org 2b+%c3\n" \ -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 10/27] x86/boot/64: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Early at boot, the kernel is mapped at a temporary address while preparing the page table. To know the changes needed for the page table with KASLR, the boot code calculate the difference between the expected address of the kernel and the one chosen by KASLR. It does not work with PIE because all symbols in code are relatives. Instead of getting the future relocated virtual address, you will get the current temporary mapping. Instructions were changed to have absolute 64-bit references. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/head_64.S | 16 ++-- 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 8344dd2f310a..7fca19e1f556 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -89,8 +89,10 @@ startup_64: popq%rsi /* Form the CR3 value being sure to include the CR3 modifier */ - addq$(early_top_pgt - __START_KERNEL_map), %rax + movabs $(early_top_pgt - __START_KERNEL_map), %rcx + addq%rcx, %rax jmp 1f + ENTRY(secondary_startup_64) UNWIND_HINT_EMPTY /* @@ -119,7 +121,8 @@ ENTRY(secondary_startup_64) popq%rsi /* Form the CR3 value being sure to include the CR3 modifier */ - addq$(init_top_pgt - __START_KERNEL_map), %rax + movabs $(init_top_pgt - __START_KERNEL_map), %rcx + addq%rcx, %rax 1: /* Enable PAE mode, PGE and LA57 */ @@ -137,7 +140,7 @@ ENTRY(secondary_startup_64) movq%rax, %cr3 /* Ensure I am executing from virtual addresses */ - movq$1f, %rax + movabs $1f, %rax ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: @@ -234,11 +237,12 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - pushq $.Lafter_lret # put return address on stack for unwinder + movabs $.Lafter_lret, %rax + pushq %rax# put return address on stack for unwinder xorq%rbp, %rbp # clear frame pointer - movqinitial_code(%rip), %rax + leaqinitial_code(%rip), %rax pushq $__KERNEL_CS# set correct cs - pushq %rax# target address in negative space + pushq (%rax) # target address in negative space lretq .Lafter_lret: END(secondary_startup_64) -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 07/27] x86: pm-trace - Adapt assembly for PIE support
Change assembly to use the new _ASM_MOVABS macro instead of _ASM_MOV for the assembly to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/pm-trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pm-trace.h b/arch/x86/include/asm/pm-trace.h index bfa32aa428e5..972070806ce9 100644 --- a/arch/x86/include/asm/pm-trace.h +++ b/arch/x86/include/asm/pm-trace.h @@ -8,7 +8,7 @@ do { \ if (pm_trace_enabled) { \ const void *tracedata; \ - asm volatile(_ASM_MOV " $1f,%0\n" \ + asm volatile(_ASM_MOVABS " $1f,%0\n"\ ".section .tracedata,\"a\"\n" \ "1:\t.word %c1\n\t"\ _ASM_PTR " %c2\n" \ -- 2.17.0.921.gf22659ad46-goog
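A short sketch of why the macro swap matters (assumes <asm/asm.h> for _ASM_MOVABS; the helper name is made up). On x86_64, _ASM_MOV expands to movq, and "movq $sym, %reg" only carries a sign-extended 32-bit immediate (R_X86_64_32S), valid only while the kernel stays in the top 2G. _ASM_MOVABS expands to movabsq, whose 64-bit immediate (R_X86_64_64) can be relocated anywhere.

#include <asm/asm.h>

static inline const void *label_addr_sketch(void)
{
        const void *p;

        /* movabsq keeps working once the image is PIE and may be mapped
         * below the traditional -2G kernel range. */
        asm volatile(_ASM_MOVABS " $1f, %0\n"
                     "1:" : "=r" (p));
        return p;
}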
[PATCH v4 13/27] x86/boot/64: Build head64.c as mcmodel large when PIE is enabled
The __startup_64 function assumes all symbols have relocated addresses instead of the current boot virtual address. PIE generated code favor relative addresses making all virtual and physical address math incorrect. If PIE is enabled, build head64.c as mcmodel large instead to ensure absolute references on all memory access. Add a global __force_order variable required when using a large model with read_cr* functions. To build head64.c as mcmodel=large, disable the retpoline gcc flags. This code is used at early boot and removed later, it doesn't need retpoline mitigation. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/kernel/Makefile | 6 ++ arch/x86/kernel/head64.c | 3 +++ 2 files changed, 9 insertions(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 02d6f5cf4e70..0f6da4b216e0 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -22,6 +22,12 @@ CFLAGS_REMOVE_early_printk.o = -pg CFLAGS_REMOVE_head64.o = -pg endif +ifdef CONFIG_X86_PIE +# Remove PIE and retpoline flags that are incompatible with mcmodel=large +CFLAGS_REMOVE_head64.o += -fPIE -mindirect-branch=thunk-extern -mindirect-branch-register +CFLAGS_head64.o = -mcmodel=large +endif + KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 2d29e47c056e..fa661fb97127 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -64,6 +64,9 @@ EXPORT_SYMBOL(vmemmap_base); #define __head __section(.head.text) +/* Required for read_cr3 when building as PIE */ +unsigned long __force_order; + static void __head *fixup_pointer(void *ptr, unsigned long physaddr) { return ptr - (void *)_text + (void *)physaddr; -- 2.17.0.921.gf22659ad46-goog
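The read_cr*() pattern that pulls in __force_order, simplified from asm/special_insns.h (sketch only). The dummy memory operand exists purely to order CR accesses, but with -mcmodel=large the compiler materializes its address, so the symbol needs a real definition; this appears to be why the patch defines it in head64.c.

extern unsigned long __force_order;

static inline unsigned long sketch_read_cr3(void)
{
        unsigned long val;

        /* "=m" (__force_order) is a fake output used only for ordering,
         * yet it still references the symbol in the object file. */
        asm volatile("mov %%cr3, %0" : "=r" (val), "=m" (__force_order));
        return val;
}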
[PATCH v4 08/27] x86/CPU: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Use the new _ASM_MOVABS macro instead of the 'mov $symbol, %dst' construct. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/processor.h | 9 ++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index e28add6b791f..7ae9fb91f7b5 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -50,7 +50,7 @@ static inline void *current_text_addr(void) { void *pc; - asm volatile("mov $1f, %0; 1:":"=r" (pc)); + asm volatile(_ASM_MOVABS " $1f, %0; 1:":"=r" (pc)); return pc; } @@ -711,6 +711,7 @@ static inline void sync_core(void) : ASM_CALL_CONSTRAINT : : "memory"); #else unsigned int tmp; + unsigned long tmp2; asm volatile ( UNWIND_HINT_SAVE @@ -721,11 +722,13 @@ static inline void sync_core(void) "pushfq\n\t" "mov %%cs, %0\n\t" "pushq %q0\n\t" - "pushq $1f\n\t" + "leaq 1f(%%rip), %1\n\t" + "pushq %1\n\t" "iretq\n\t" UNWIND_HINT_RESTORE "1:" - : "=" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); + : "=" (tmp), "=" (tmp2), ASM_CALL_CONSTRAINT + : : "cc", "memory"); #endif } -- 2.17.0.921.gf22659ad46-goog
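A sketch of the only PIE-sensitive step in the iretq-based sync_core() sequence: producing the return RIP. "pushq $1f" needs a sign-extended 32-bit absolute address, so the patch computes the label address RIP-relative into a scratch register first. The helper below is illustrative, not kernel code.

static inline void *local_label_addr_sketch(void)
{
        void *p;

        /* PIE-safe way to take the address of a local asm label; the real
         * sync_core() then pushes this register for iretq to use as RIP. */
        asm volatile("leaq 1f(%%rip), %0\n"
                     "1:" : "=r" (p));
        return p;
}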
[PATCH v4 11/27] x86/power/64: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier Acked-by: Pavel Machek --- arch/x86/power/hibernate_asm_64.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/power/hibernate_asm_64.S b/arch/x86/power/hibernate_asm_64.S index ce8da3a0412c..6fdd7bbc3c33 100644 --- a/arch/x86/power/hibernate_asm_64.S +++ b/arch/x86/power/hibernate_asm_64.S @@ -24,7 +24,7 @@ #include ENTRY(swsusp_arch_suspend) - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movq%rsp, pt_regs_sp(%rax) movq%rbp, pt_regs_bp(%rax) movq%rsi, pt_regs_si(%rax) @@ -115,7 +115,7 @@ ENTRY(restore_registers) movq%rax, %cr4; # turn PGE back on /* We don't restore %rax, it must be 0 anyway */ - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movqpt_regs_sp(%rax), %rsp movqpt_regs_bp(%rax), %rbp movqpt_regs_si(%rax), %rsi -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 17/27] x86/relocs: Handle PIE relocations
Change the relocation tool to correctly handle relocations generated by -fPIE option: - Add relocation for each entry of the .got section given the linker does not generate R_X86_64_GLOB_DAT on a simple link. - Ignore R_X86_64_GOTPCREL. Signed-off-by: Thomas Garnier --- arch/x86/tools/relocs.c | 93 - 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 220e97841e49..a35cc337f883 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -32,6 +32,7 @@ struct section { Elf_Sym*symtab; Elf_Rel*reltab; char *strtab; + Elf_Addr *got; }; static struct section *secs; @@ -293,6 +294,35 @@ static Elf_Sym *sym_lookup(const char *symname) return 0; } +static Elf_Sym *sym_lookup_addr(Elf_Addr addr, const char **name) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = [i]; + long nsyms; + Elf_Sym *symtab; + Elf_Sym *sym; + + if (sec->shdr.sh_type != SHT_SYMTAB) + continue; + + nsyms = sec->shdr.sh_size/sizeof(Elf_Sym); + symtab = sec->symtab; + + for (sym = symtab; --nsyms >= 0; sym++) { + if (sym->st_value == addr) { + if (name) { + *name = sym_name(sec->link->strtab, +sym); + } + return sym; + } + } + } + return 0; +} + + #if BYTE_ORDER == LITTLE_ENDIAN #define le16_to_cpu(val) (val) #define le32_to_cpu(val) (val) @@ -513,6 +543,33 @@ static void read_relocs(FILE *fp) } } +static void read_got(FILE *fp) +{ + int i; + for (i = 0; i < ehdr.e_shnum; i++) { + struct section *sec = [i]; + sec->got = NULL; + if (sec->shdr.sh_type != SHT_PROGBITS || + strcmp(sec_name(i), ".got")) { + continue; + } + sec->got = malloc(sec->shdr.sh_size); + if (!sec->got) { + die("malloc of %d bytes for got failed\n", + sec->shdr.sh_size); + } + if (fseek(fp, sec->shdr.sh_offset, SEEK_SET) < 0) { + die("Seek to %d failed: %s\n", + sec->shdr.sh_offset, strerror(errno)); + } + if (fread(sec->got, 1, sec->shdr.sh_size, fp) + != sec->shdr.sh_size) { + die("Cannot read got: %s\n", + strerror(errno)); + } + } +} + static void print_absolute_symbols(void) { @@ -643,6 +700,32 @@ static void add_reloc(struct relocs *r, uint32_t offset) r->offset[r->count++] = offset; } +/* + * The linker does not generate relocations for the GOT for the kernel. + * If a GOT is found, simulate the relocations that should have been included. + */ +static void walk_got_table(int (*process)(struct section *sec, Elf_Rel *rel, + Elf_Sym *sym, const char *symname), + struct section *sec) +{ + int i; + Elf_Addr entry; + Elf_Sym *sym; + const char *symname; + Elf_Rel rel; + + for (i = 0; i < sec->shdr.sh_size/sizeof(Elf_Addr); i++) { + entry = sec->got[i]; + sym = sym_lookup_addr(entry, ); + if (!sym) + die("Could not found got symbol for entry %d\n", i); + rel.r_offset = sec->shdr.sh_addr + i * sizeof(Elf_Addr); + rel.r_info = ELF_BITS == 64 ? R_X86_64_GLOB_DAT +: R_386_GLOB_DAT; + process(sec, , sym, symname); + } +} + static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname)) { @@ -656,6 +739,8 @@ static void walk_relocs(int (*process)(struct section *sec, Elf_Rel *rel, struct section *sec = [i]; if (sec->shdr.sh_type != SHT_REL_TYPE) { + if (sec->got) + walk_got_table(process, sec); continue; } sec_symtab = sec->link; @@ -765,6 +850,7 @@ static int do_reloc64(struct section *sec, Elf_Rel *rel, ElfW(Sym) *sym, offset += per_cpu_load_addr; switch (r_type) { + case R_X86_64_GOTPCREL: case R_X86_64_NONE: /*
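A worked sketch of the relocation walk_got_table() fabricates for one slot (slot number and symbol name are made up; the types come from the tool itself). The offset is simply the virtual address of the GOT slot, and the type is the GLOB_DAT entry the linker would normally have emitted.

static void example_got_reloc(struct section *sec)
{
        Elf_Rel rel;
        int slot = 2;        /* assume this slot holds the address of "foo" */

        rel.r_offset = sec->shdr.sh_addr + slot * sizeof(Elf_Addr);
        rel.r_info   = ELF_BITS == 64 ? R_X86_64_GLOB_DAT : R_386_GLOB_DAT;
        /* Passing this to process() records the offset exactly as if the
         * linker had emitted the GLOB_DAT entry, so the boot-time
         * relocation loop patches the slot like any other reference. */
}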
[PATCH v4 21/27] x86/ftrace: Adapt function tracing for PIE support
When using -fPIE/PIC with function tracing, the compiler generates a call through the GOT (call *__fentry__@GOTPCREL). This instruction takes 6 bytes instead of 5 on the usual relative call. If PIE is enabled, replace the 6th byte of the GOT call by a 1-byte nop so ftrace can handle the previous 5-bytes as before. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/ftrace.h | 4 -- arch/x86/include/asm/sections.h | 4 ++ arch/x86/kernel/ftrace.c| 42 +- scripts/recordmcount.c | 79 ++--- 4 files changed, 97 insertions(+), 32 deletions(-) diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index c18ed65287d5..b1eb3f6735fc 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -24,10 +24,6 @@ extern void __fentry__(void); static inline unsigned long ftrace_call_adjust(unsigned long addr) { - /* -* addr is the address of the mcount call instruction. -* recordmcount does the necessary offset calculation. -*/ return addr; } diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index 5c019d23d06b..da3d98bb2bcb 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -13,4 +13,8 @@ extern char __end_rodata_hpage_align[]; extern char __entry_trampoline_start[], __entry_trampoline_end[]; #endif +#if defined(CONFIG_X86_PIE) +extern char __start_got[], __end_got[]; +#endif + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 01ebcb6f263e..73b3c30cb7a3 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -102,7 +102,7 @@ static const unsigned char *ftrace_nop_replace(void) static int ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, - unsigned const char *new_code) + unsigned const char *new_code) { unsigned char replaced[MCOUNT_INSN_SIZE]; @@ -135,6 +135,44 @@ ftrace_modify_code_direct(unsigned long ip, unsigned const char *old_code, return 0; } +/* Bytes before call GOT offset */ +const unsigned char got_call_preinsn[] = { 0xff, 0x15 }; + +static int +ftrace_modify_initial_code(unsigned long ip, unsigned const char *old_code, + unsigned const char *new_code) +{ + unsigned char replaced[MCOUNT_INSN_SIZE + 1]; + + ftrace_expected = old_code; + + /* +* If PIE is not enabled or no GOT call was found, default to the +* original approach to code modification. +*/ + if (!IS_ENABLED(CONFIG_X86_PIE) || + probe_kernel_read(replaced, (void *)ip, sizeof(replaced)) || + memcmp(replaced, got_call_preinsn, sizeof(got_call_preinsn))) + return ftrace_modify_code_direct(ip, old_code, new_code); + + /* +* Build a nop slide with a 5-byte nop and 1-byte nop to keep the ftrace +* hooking algorithm working with the expected 5 bytes instruction. +*/ + memcpy(replaced, new_code, MCOUNT_INSN_SIZE); + replaced[MCOUNT_INSN_SIZE] = ideal_nops[1][0]; + + ip = text_ip_addr(ip); + + if (probe_kernel_write((void *)ip, replaced, sizeof(replaced))) + return -EPERM; + + sync_core(); + + return 0; + +} + int ftrace_make_nop(struct module *mod, struct dyn_ftrace *rec, unsigned long addr) { @@ -153,7 +191,7 @@ int ftrace_make_nop(struct module *mod, * just modify the code directly. 
*/ if (addr == MCOUNT_ADDR) - return ftrace_modify_code_direct(rec->ip, old, new); + return ftrace_modify_initial_code(rec->ip, old, new); ftrace_expected = NULL; diff --git a/scripts/recordmcount.c b/scripts/recordmcount.c index 895c40e8679f..aa71b912958d 100644 --- a/scripts/recordmcount.c +++ b/scripts/recordmcount.c @@ -171,33 +171,9 @@ umalloc(size_t size) return addr; } -static unsigned char ideal_nop5_x86_64[5] = { 0x0f, 0x1f, 0x44, 0x00, 0x00 }; -static unsigned char ideal_nop5_x86_32[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 }; -static unsigned char *ideal_nop; - static char rel_type_nop; - static int (*make_nop)(void *map, size_t const offset); - -static int make_nop_x86(void *map, size_t const offset) -{ - uint32_t *ptr; - unsigned char *op; - - /* Confirm we have 0xe8 0x0 0x0 0x0 0x0 */ - ptr = map + offset; - if (*ptr != 0) - return -1; - - op = map + offset - 1; - if (*op != 0xe8) - return -1; - - /* convert to nop */ - ulseek(fd_map, offset - 1, SEEK_SET); - uwrite(fd_map, ideal_nop, 5); - return 0; -} +static unsigned char *ideal_nop; static unsigned char ideal_nop4_arm_le[4] =
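A byte-level view of what ftrace_modify_initial_code() deals with (displacement bytes are placeholders). The GOT form is one byte longer, so the patch rewrites it as the usual 5-byte nop plus a 1-byte nop (ideal_nops[1][0] == 0x90), letting the rest of ftrace keep assuming MCOUNT_INSN_SIZE == 5.

/* Sketch of the three byte patterns involved. */
static const unsigned char rel_call[5]  = { 0xe8, 0x00, 0x00, 0x00, 0x00 };        /* call __fentry__ */
static const unsigned char got_call[6]  = { 0xff, 0x15, 0x00, 0x00, 0x00, 0x00 };  /* call *__fentry__@GOTPCREL(%rip) */
static const unsigned char nop_slide[6] = { 0x0f, 0x1f, 0x44, 0x00, 0x00, 0x90 };  /* 5-byte nop + 1-byte nop */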
[PATCH v4 23/27] x86/modules: Adapt module loading for PIE support
Adapt module loading to support PIE relocations. Generate dynamic GOT if a symbol requires it but no entry exists in the kernel GOT. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/Makefile | 4 + arch/x86/include/asm/module.h | 11 ++ arch/x86/include/asm/sections.h | 4 + arch/x86/kernel/module.c| 181 +++- arch/x86/kernel/module.lds | 3 + 5 files changed, 198 insertions(+), 5 deletions(-) create mode 100644 arch/x86/kernel/module.lds diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 277ffc57ae13..20bb6cbd8938 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -134,7 +134,11 @@ else KBUILD_CFLAGS += $(cflags-y) KBUILD_CFLAGS += -mno-red-zone +ifdef CONFIG_X86_PIE +KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/x86/kernel/module.lds +else KBUILD_CFLAGS += -mcmodel=kernel +endif # -funit-at-a-time shrinks the kernel .text considerably # unfortunately it makes reading oopses harder. diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h index 7948a17febb4..68ff05e14288 100644 --- a/arch/x86/include/asm/module.h +++ b/arch/x86/include/asm/module.h @@ -5,12 +5,23 @@ #include #include +#ifdef CONFIG_X86_PIE +struct mod_got_sec { + struct elf64_shdr *got; + int got_num_entries; + int got_max_entries; +}; +#endif + struct mod_arch_specific { #ifdef CONFIG_UNWINDER_ORC unsigned int num_orcs; int *orc_unwind_ip; struct orc_entry *orc_unwind; #endif +#ifdef CONFIG_X86_PIE + struct mod_got_sec core; +#endif }; #ifdef CONFIG_X86_64 diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h index da3d98bb2bcb..89b3a95c8d11 100644 --- a/arch/x86/include/asm/sections.h +++ b/arch/x86/include/asm/sections.h @@ -17,4 +17,8 @@ extern char __entry_trampoline_start[], __entry_trampoline_end[]; extern char __start_got[], __end_got[]; #endif +#if defined(CONFIG_X86_PIE) +extern char __start_got[], __end_got[]; +#endif + #endif /* _ASM_X86_SECTIONS_H */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index f58336af095c..88895f3d474b 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -77,6 +78,173 @@ static unsigned long int get_module_load_offset(void) } #endif +#ifdef CONFIG_X86_PIE +static u64 find_got_kernel_entry(Elf64_Sym *sym, const Elf64_Rela *rela) +{ + u64 *pos; + + for (pos = (u64*)__start_got; pos < (u64*)__end_got; pos++) { + if (*pos == sym->st_value) + return (u64)pos + rela->r_addend; + } + + return 0; +} + +static u64 module_emit_got_entry(struct module *mod, void *loc, +const Elf64_Rela *rela, Elf64_Sym *sym) +{ + struct mod_got_sec *gotsec = >arch.core; + u64 *got = (u64*)gotsec->got->sh_addr; + int i = gotsec->got_num_entries; + u64 ret; + + /* Check if we can use the kernel GOT */ + ret = find_got_kernel_entry(sym, rela); + if (ret) + return ret; + + got[i] = sym->st_value; + + /* +* Check if the entry we just created is a duplicate. Given that the +* relocations are sorted, this will be the last entry we allocated. +* (if one exists). +*/ + if (i > 0 && got[i] == got[i - 2]) { + ret = (u64)[i - 1]; + } else { + gotsec->got_num_entries++; + BUG_ON(gotsec->got_num_entries > gotsec->got_max_entries); + ret = (u64)[i]; + } + + return ret + rela->r_addend; +} + +#define cmp_3way(a,b) ((a) < (b) ? 
-1 : (a) > (b)) + +static int cmp_rela(const void *a, const void *b) +{ + const Elf64_Rela *x = a, *y = b; + int i; + + /* sort by type, symbol index and addend */ + i = cmp_3way(ELF64_R_TYPE(x->r_info), ELF64_R_TYPE(y->r_info)); + if (i == 0) + i = cmp_3way(ELF64_R_SYM(x->r_info), ELF64_R_SYM(y->r_info)); + if (i == 0) + i = cmp_3way(x->r_addend, y->r_addend); + return i; +} + +static bool duplicate_rel(const Elf64_Rela *rela, int num) +{ + /* +* Entries are sorted by type, symbol index and addend. That means +* that, if a duplicate entry exists, it must be in the preceding +* slot. +*/ + return num > 0 && cmp_rela(rela + num, rela + num - 1) == 0; +} + +static unsigned int count_gots(Elf64_Sym *syms, Elf64_Rela *rela, int num) +{ + unsigned int ret = 0; + Elf64_Sym *s; + int i; + + for (i = 0; i <
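The apply_relocate_add() hunk is truncated above, so here is a hedged sketch of how a module's R_X86_64_GOTPCREL relocation would resolve with the helper this patch introduces (the function below and its variable names are illustrative).

static void example_gotpcrel(struct module *mod, void *loc,
                             const Elf64_Rela *rela, Elf64_Sym *sym)
{
        /* Either an existing slot in the kernel GOT or a slot allocated in
         * the module's generated GOT; the addend (usually -4) is already
         * folded in by module_emit_got_entry(). */
        u64 val = module_emit_got_entry(mod, loc, rela, sym);

        /* Stored as a 32-bit PC-relative displacement, so the module's
         * "call/mov ...@GOTPCREL(%rip)" loads the target address from the
         * GOT slot at runtime. */
        *(s32 *)loc = (s32)(val - (u64)loc);
}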
[PATCH v4 22/27] x86/modules: Add option to start module section after kernel
Add an option so the module section is just after the mapped kernel. It will ensure position independent modules are always at the right distance from the kernel and do not require mcmodule=large. It also optimize the available size for modules by getting rid of the empty space on kernel randomization range. Signed-off-by: Thomas Garnier --- Documentation/x86/x86_64/mm.txt | 3 +++ arch/x86/Kconfig| 4 arch/x86/include/asm/pgtable_64_types.h | 6 ++ arch/x86/kernel/head64.c| 5 - arch/x86/mm/dump_pagetables.c | 3 ++- 5 files changed, 19 insertions(+), 2 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 5432a96d31ff..334ab458c82d 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -77,3 +77,6 @@ Their order is preserved but their base will be offset early at boot time. Be very careful vs. KASLR when changing anything here. The KASLR address range must not overlap with anything except the KASAN shadow area, which is correct as KASAN disables KASLR. + +If CONFIG_DYNAMIC_MODULE_BASE is enabled, the module section follows the end of +the mapped kernel. diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 177e712201d1..94a00d81ec18 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2198,6 +2198,10 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. +# Module section starts just after the end of the kernel module +config DYNAMIC_MODULE_BASE + bool + config X86_GLOBAL_STACKPROTECTOR bool "Stack cookie using a global variable" depends on CC_STACKPROTECTOR_AUTO diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index adb47552e6bb..3ab25b908879 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -7,6 +7,7 @@ #ifndef __ASSEMBLY__ #include #include +#include /* * These are used to make use of C type-checking.. @@ -126,7 +127,12 @@ extern unsigned int ptrs_per_p4d; #define VMALLOC_END(VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +#ifdef CONFIG_DYNAMIC_MODULE_BASE +#define MODULES_VADDR ALIGN(((unsigned long)_end + PAGE_SIZE), PMD_SIZE) +#else #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) +#endif + /* The module sections ends with the start of the fixmap */ #define MODULES_END_AC(0xff00, UL) #define MODULES_LEN(MODULES_END - MODULES_VADDR) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fa661fb97127..3a1ce822e1c0 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -394,12 +394,15 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) * Build-time sanity checks on the kernel image and module * area mappings. 
(these are purely build-time and produce no code) */ +#ifndef CONFIG_DYNAMIC_MODULE_BASE BUILD_BUG_ON(MODULES_VADDR < __START_KERNEL_map); BUILD_BUG_ON(MODULES_VADDR - __START_KERNEL_map < KERNEL_IMAGE_SIZE); - BUILD_BUG_ON(MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); + BUILD_BUG_ON(!IS_ENABLED(CONFIG_RANDOMIZE_BASE_LARGE) && +MODULES_LEN + KERNEL_IMAGE_SIZE > 2*PUD_SIZE); BUILD_BUG_ON((__START_KERNEL_map & ~PMD_MASK) != 0); BUILD_BUG_ON((MODULES_VADDR & ~PMD_MASK) != 0); BUILD_BUG_ON(!(MODULES_VADDR > __START_KERNEL)); +#endif MAYBE_BUILD_BUG_ON(!(((MODULES_END - 1) & PGDIR_MASK) == (__START_KERNEL & PGDIR_MASK))); BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END); diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index cc7ff5957194..dca4098ce4fd 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -105,7 +105,7 @@ static struct addr_marker address_markers[] = { [EFI_END_NR]= { EFI_VA_END, "EFI Runtime Services" }, #endif [HIGH_KERNEL_NR]= { __START_KERNEL_map, "High Kernel Mapping" }, - [MODULES_VADDR_NR] = { MODULES_VADDR, "Modules" }, + [MODULES_VADDR_NR] = { 0/*MODULES_VADDR*/, "Modules" }, [MODULES_END_NR]= { MODULES_END,"End Modules" }, [FIXADDR_START_NR] = { FIXADDR_START, "Fixmap Area" }, [END_OF_SPACE_NR] = { -1, NULL } @@ -600,6 +600,7 @@ static int __init pt_dump_init(void) address_markers[KASAN_SHADOW_START_NR].start_address = KASAN_SHADOW_START; address_markers[KASAN_SHADOW_END_NR].start_address = KASAN_SHADOW_END; #endif + address_markers[MODULES_VADDR_NR].start_address = MODULES_VADDR; #endif #ifdef CONFIG_X86_32 address_markers[VMALLOC_START_NR].start_address = VMALLOC_START; -- 2.17.0.921.gf22659ad46-goog
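A worked example of the new MODULES_VADDR (the end address is made up): with CONFIG_DYNAMIC_MODULE_BASE the module area starts one page past _end, rounded up to the next 2M (PMD_SIZE) boundary, instead of at the fixed __START_KERNEL_map + KERNEL_IMAGE_SIZE.

/* If the randomized image ends at _end = 0xffffffffa2345678:
 *
 *   MODULES_VADDR = ALIGN(0xffffffffa2345678 + PAGE_SIZE, PMD_SIZE)
 *                 = 0xffffffffa2400000
 *
 * so modules always sit directly after the kernel and stay within
 * +/-2G of it regardless of where KASLR placed the image. */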
[PATCH v4 20/27] x86: Support global stack cookie
Add an off-by-default configuration option to use a global stack cookie instead of the default TLS. This configuration option will only be used with PIE binaries. For kernel stack cookie, the compiler uses the mcmodel=kernel to switch between the fs segment to gs segment. A PIE binary does not use mcmodel=kernel because it can be relocated anywhere, therefore the compiler will default to the fs segment register. This is fixed on the latest version of gcc. If the segment selector is available, it will be automatically added. If the automatic configuration was selected, a warning is written and the global variable stack cookie is used. If a specific stack mode was selected (regular or strong) and the compiler does not support selecting the segment register, an error is emitted. Signed-off-by: Thomas Garnier --- arch/x86/Kconfig | 12 arch/x86/Makefile | 9 + arch/x86/entry/entry_32.S | 3 ++- arch/x86/entry/entry_64.S | 3 ++- arch/x86/include/asm/processor.h | 3 ++- arch/x86/include/asm/stackprotector.h | 19 ++- arch/x86/kernel/asm-offsets.c | 3 ++- arch/x86/kernel/asm-offsets_32.c | 3 ++- arch/x86/kernel/asm-offsets_64.c | 3 ++- arch/x86/kernel/cpu/common.c | 3 ++- arch/x86/kernel/head_32.S | 3 ++- arch/x86/kernel/process.c | 5 + 12 files changed, 56 insertions(+), 13 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 1434ec78e556..177e712201d1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2198,6 +2198,18 @@ config RANDOMIZE_MEMORY_PHYSICAL_PADDING If unsure, leave at the default value. +config X86_GLOBAL_STACKPROTECTOR + bool "Stack cookie using a global variable" + depends on CC_STACKPROTECTOR_AUTO + default n + ---help--- + This option turns on the "stack-protector" GCC feature using a global + variable instead of a segment register. It is useful when the + compiler does not support custom segment registers when building a + position independent (PIE) binary. 
+ + If unsure, say N + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 60135cbd905c..277ffc57ae13 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -141,6 +141,15 @@ else KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) endif +ifdef CONFIG_X86_GLOBAL_STACKPROTECTOR +ifeq ($(call cc-option, -mstack-protector-guard=global),) +$(error Cannot use CONFIG_X86_GLOBAL_STACKPROTECTOR: \ +-mstack-protector-guard=global not supported \ +by compiler) +endif +KBUILD_CFLAGS += -mstack-protector-guard=global +endif + ifdef CONFIG_X86_X32 x32_ld_ok := $(call try-run,\ /bin/echo -e '1: .quad 1b' | \ diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bef8e2b202a8..b7d5bc710ae7 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -239,7 +239,8 @@ ENTRY(__switch_to_asm) movl%esp, TASK_threadsp(%eax) movlTASK_threadsp(%edx), %esp -#ifdef CONFIG_CC_STACKPROTECTOR +#if defined(CONFIG_CC_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) movlTASK_stack_canary(%edx), %ebx movl%ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f9b42ca4bf60..a3d1ca4ec516 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -357,7 +357,8 @@ ENTRY(__switch_to_asm) movq%rsp, TASK_threadsp(%rdi) movqTASK_threadsp(%rsi), %rsp -#ifdef CONFIG_CC_STACKPROTECTOR +#if defined(CONFIG_CC_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) movqTASK_stack_canary(%rsi), %rbx movq%rbx, PER_CPU_VAR(irq_stack_union + stack_canary_offset) #endif diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 8162b5a24d8c..566fa5c56148 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -414,7 +414,8 @@ extern asmlinkage void ignore_sysret(void); void save_fsgs_for_kvm(void); #endif #else /* X86_64 */ -#ifdef CONFIG_CC_STACKPROTECTOR +#if defined(CONFIG_CC_STACKPROTECTOR) && \ + !defined(CONFIG_X86_GLOBAL_STACKPROTECTOR) /* * Make sure stack canary segment base is cached-aligned: * "For Intel Atom processors, avoid non zero segment base address diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 371b3a4af000..5063f57d99f5 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackpr
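A simplified view of what the new option changes in generated code (a sketch, not actual compiler output). The default kernel canary lives in per-CPU storage reached through %gs, which relies on mcmodel=kernel; with -mstack-protector-guard=global the compiler checks a single __stack_chk_guard variable instead.

/* Function prologue canary load, simplified:
 *
 *   default (per-CPU TLS canary):
 *       movq %gs:40, %rax
 *
 *   -mstack-protector-guard=global:
 *       movq __stack_chk_guard(%rip), %rax
 *
 * The global form works for PIE without a fixed segment register, at the
 * cost of sharing one canary value across all tasks. */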
[PATCH v4 26/27] x86/relocs: Add option to generate 64-bit relocations
The x86 relocation tool generates a list of 32-bit signed integers. There was no need to use 64-bit integers because all addresses where above the 2G top of the memory. This change add a large-reloc option to generate 64-bit unsigned integers. It can be used when the kernel plan to go below the top 2G and 32-bit integers are not enough. Signed-off-by: Thomas Garnier --- arch/x86/tools/relocs.c| 60 +++--- arch/x86/tools/relocs.h| 4 +-- arch/x86/tools/relocs_common.c | 15 ++--- 3 files changed, 60 insertions(+), 19 deletions(-) diff --git a/arch/x86/tools/relocs.c b/arch/x86/tools/relocs.c index 29283ad3950f..a29eaac6 100644 --- a/arch/x86/tools/relocs.c +++ b/arch/x86/tools/relocs.c @@ -13,8 +13,14 @@ static Elf_Ehdr ehdr; +#if ELF_BITS == 64 +typedef uint64_t rel_off_t; +#else +typedef uint32_t rel_off_t; +#endif + struct relocs { - uint32_t*offset; + rel_off_t *offset; unsigned long count; unsigned long size; }; @@ -685,7 +691,7 @@ static void print_absolute_relocs(void) printf("\n"); } -static void add_reloc(struct relocs *r, uint32_t offset) +static void add_reloc(struct relocs *r, rel_off_t offset) { if (r->count == r->size) { unsigned long newsize = r->size + 5; @@ -1061,26 +1067,48 @@ static void sort_relocs(struct relocs *r) qsort(r->offset, r->count, sizeof(r->offset[0]), cmp_relocs); } -static int write32(uint32_t v, FILE *f) +static int write32(rel_off_t rel, FILE *f) { - unsigned char buf[4]; + unsigned char buf[sizeof(uint32_t)]; + uint32_t v = (uint32_t)rel; put_unaligned_le32(v, buf); - return fwrite(buf, 1, 4, f) == 4 ? 0 : -1; + return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1; } -static int write32_as_text(uint32_t v, FILE *f) +static int write32_as_text(rel_off_t rel, FILE *f) { + uint32_t v = (uint32_t)rel; return fprintf(f, "\t.long 0x%08"PRIx32"\n", v) > 0 ? 0 : -1; } -static void emit_relocs(int as_text, int use_real_mode) +static int write64(rel_off_t rel, FILE *f) +{ + unsigned char buf[sizeof(uint64_t)]; + uint64_t v = (uint64_t)rel; + + put_unaligned_le64(v, buf); + return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1; +} + +static int write64_as_text(rel_off_t rel, FILE *f) +{ + uint64_t v = (uint64_t)rel; + return fprintf(f, "\t.quad 0x%016"PRIx64"\n", v) > 0 ? 0 : -1; +} + +static void emit_relocs(int as_text, int use_real_mode, int use_large_reloc) { int i; - int (*write_reloc)(uint32_t, FILE *) = write32; + int (*write_reloc)(rel_off_t, FILE *); int (*do_reloc)(struct section *sec, Elf_Rel *rel, Elf_Sym *sym, const char *symname); + if (use_large_reloc) + write_reloc = write64; + else + write_reloc = write32; + #if ELF_BITS == 64 if (!use_real_mode) do_reloc = do_reloc64; @@ -1091,6 +1119,9 @@ static void emit_relocs(int as_text, int use_real_mode) do_reloc = do_reloc32; else do_reloc = do_reloc_real; + + /* Large relocations only for 64-bit */ + use_large_reloc = 0; #endif /* Collect up the relocations */ @@ -1114,8 +1145,13 @@ static void emit_relocs(int as_text, int use_real_mode) * gas will like. 
*/ printf(".section \".data.reloc\",\"a\"\n"); - printf(".balign 4\n"); - write_reloc = write32_as_text; + if (use_large_reloc) { + printf(".balign 8\n"); + write_reloc = write64_as_text; + } else { + printf(".balign 4\n"); + write_reloc = write32_as_text; + } } if (use_real_mode) { @@ -1183,7 +1219,7 @@ static void print_reloc_info(void) void process(FILE *fp, int use_real_mode, int as_text, int show_absolute_syms, int show_absolute_relocs, -int show_reloc_info) +int show_reloc_info, int use_large_reloc) { regex_init(use_real_mode); read_ehdr(fp); @@ -1206,5 +1242,5 @@ void process(FILE *fp, int use_real_mode, int as_text, print_reloc_info(); return; } - emit_relocs(as_text, use_real_mode); + emit_relocs(as_text, use_real_mode, use_large_reloc); } diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index 43c83c0fd22c..3d401da59df7 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -31,8 +31,8 @@ enum symtype { void process_32(FILE *fp, int use_real_mode, int as_text, int show_absolute_syms, int show_a
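For readers who do not want to dig through relocs.c, the stand-alone sketch below shows the only substantive difference between the two emission paths added by this patch: the little-endian packing width. It mirrors what put_unaligned_le32/put_unaligned_le64 do in the tool, but it is an illustration rather than the tool itself, and the sample address is arbitrary.

#include <stdint.h>
#include <stdio.h>

int emit_le32(uint64_t rel, FILE *f)
{
	unsigned char buf[4];
	uint32_t v = (uint32_t)rel;	/* truncates: only safe above the -2G boundary */

	for (int i = 0; i < 4; i++)
		buf[i] = (unsigned char)(v >> (8 * i));
	return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1;
}

int emit_le64(uint64_t rel, FILE *f)
{
	unsigned char buf[8];

	for (int i = 0; i < 8; i++)
		buf[i] = (unsigned char)(rel >> (8 * i));
	return fwrite(buf, 1, sizeof(buf), f) == sizeof(buf) ? 0 : -1;
}

int main(void)
{
	FILE *f = tmpfile();
	uint64_t addr = 0xfffffffe12345678ull;	/* arbitrary address below -2G */

	if (!f)
		return 1;
	emit_le32(addr, f);	/* 4-byte entry loses the high bits */
	emit_le64(addr, f);	/* 8-byte entry survives intact */
	fclose(f);
	return 0;
}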
[PATCH v4 24/27] x86/mm: Make the x86 GOT read-only
The GOT is changed during early boot when relocations are applied. Make it read-only directly. This table exists only for PIE binaries. Position Independent Executable (PIE) support will allow extending the KASLR randomization range below the top 2G of the address space. Signed-off-by: Thomas Garnier --- include/asm-generic/vmlinux.lds.h | 12 1 file changed, 12 insertions(+) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index e373e2e10f6a..e5b0710fe693 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -314,6 +314,17 @@ __end_ro_after_init = .; #endif +#ifdef CONFIG_X86_PIE +#define RO_GOT_X86 \ + .got: AT(ADDR(.got) - LOAD_OFFSET) {\ + VMLINUX_SYMBOL(__start_got) = .;\ + *(.got);\ + VMLINUX_SYMBOL(__end_got) = .; \ + } +#else +#define RO_GOT_X86 +#endif + /* * Read only Data */ @@ -370,6 +381,7 @@ __end_builtin_fw = .; \ } \ \ + RO_GOT_X86 \ TRACEDATA \ \ /* Kernel symbol table: Normal symbols */ \ -- 2.17.0.921.gf22659ad46-goog
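As a rough sketch of why the __start_got/__end_got markers are useful, the helper below shows how early boot code could bias every GOT slot by the KASLR displacement before the section is mapped read-only. The function and its parameters are hypothetical; in this series the actual adjustment happens in the early relocation path, the linker script only exports the boundaries.

#include <stdint.h>

/* 'start'/'end' would come from the __start_got/__end_got markers above;
 * 'load_delta' is the KASLR displacement.  All names here are illustrative. */
void adjust_got_entries(uint64_t *start, uint64_t *end, uint64_t load_delta)
{
	for (uint64_t *entry = start; entry < end; entry++) {
		if (*entry)		/* leave empty/reserved slots alone */
			*entry += load_delta;
	}
}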
[PATCH v4 25/27] x86/pie: Add option to build the kernel as PIE
Add the CONFIG_X86_PIE option which builds the kernel as a Position Independent Executable (PIE). The kernel is currently build with the mcmodel=kernel option which forces it to stay on the top 2G of the virtual address space. With PIE, the kernel will be able to move below the current limit. The --emit-relocs linker option was kept instead of using -pie to limit the impact on mapped sections. Any incompatible relocation will be catch by the arch/x86/tools/relocs binary at compile time. If segment based stack cookies are enabled, try to use the compiler option to select the segment register. If not available, automatically enabled global stack cookie in auto mode. Otherwise, recommend compiler update or global stack cookie option. Performance/Size impact: Size of vmlinux (Default configuration): File size: - PIE disabled: +0.18% - PIE enabled: -1.977% (less relocations) .text section: - PIE disabled: same - PIE enabled: same Size of vmlinux (Ubuntu configuration): File size: - PIE disabled: +0.21% - PIE enabled: +10% .text section: - PIE disabled: same - PIE enabled: +0.001% The size increase is mainly due to not having access to the 32-bit signed relocation that can be used with mcmodel=kernel. A small part is due to reduced optimization for PIE code. This bug [1] was opened with gcc to provide a better code generation for kernel PIE. Hackbench (50% and 1600% on thread/process for pipe/sockets): - PIE disabled: no significant change (avg -/+ 0.5% on latest test). - PIE enabled: between -1% to +1% in average (default and Ubuntu config). Kernbench (average of 10 Half and Optimal runs): Elapsed Time: - PIE disabled: no significant change (avg -0.5%) - PIE enabled: average -0.5% to +0.5% System Time: - PIE disabled: no significant change (avg -0.1%) - PIE enabled: average -0.4% to +0.4%. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82303 Signed-off-by: Thomas Garnier merge pie --- arch/x86/Kconfig | 8 arch/x86/Makefile | 45 - 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 94a00d81ec18..47cf21e452d2 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2214,6 +2214,14 @@ config X86_GLOBAL_STACKPROTECTOR If unsure, say N +config X86_PIE + bool + depends on X86_64 + select DEFAULT_HIDDEN + select WEAK_PROVIDE_HIDDEN + select DYNAMIC_MODULE_BASE + select MODULE_REL_CRCS if MODVERSIONS + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 20bb6cbd8938..c92bcca4400c 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -60,6 +60,8 @@ endif KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow KBUILD_CFLAGS += $(call cc-option,-mno-avx,) +stackglobal := $(call cc-option-yn, -mstack-protector-guard=global) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -135,7 +137,48 @@ else KBUILD_CFLAGS += -mno-red-zone ifdef CONFIG_X86_PIE +KBUILD_CFLAGS += -fPIE KBUILD_LDFLAGS_MODULE += -T $(srctree)/arch/x86/kernel/module.lds + +# Relax relocation in both CFLAGS and LDFLAGS to support older compilers +KBUILD_CFLAGS += $(call cc-option,-Wa$(comma)-mrelax-relocations=no) +LDFLAGS_vmlinux += $(call ld-option,--no-relax) +KBUILD_LDFLAGS_MODULE += $(call ld-option,--no-relax) + +# Stack validation is not yet support due to self-referenced switches +ifdef CONFIG_STACK_VALIDATION +$(warning CONFIG_STACK_VALIDATION is not yet supported for x86_64 pie \ + build.) 
+SKIP_STACK_VALIDATION := 1 +export SKIP_STACK_VALIDATION +endif + +ifndef CONFIG_CC_STACKPROTECTOR_NONE +ifndef CONFIG_X86_GLOBAL_STACKPROTECTOR +stackseg-flag := -mstack-protector-guard-reg=%gs +ifeq ($(call cc-option-yn,$(stackseg-flag)),n) +# Try to enable global stack cookie if possible +ifeq ($(stackglobal), y) +$(warning Cannot use CONFIG_CC_STACKPROTECTOR_* while \ +building a position independent kernel. \ +Default to global stack protector \ +(CONFIG_X86_GLOBAL_STACKPROTECTOR).) +CONFIG_X86_GLOBAL_STACKPROTECTOR := y +KBUILD_CFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR +KBUILD_AFLAGS += -DCONFIG_X86_GLOBAL_STACKPROTECTOR +else +$(error echo Cannot use \ +CONFIG_CC_STACKPROTECTOR_(REGULAR|STRONG|AUTO) \ +while building a position independent binary. \ +Update your compiler or use \ +CONFIG_X86_GLOBAL_STAC
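To see why PIE requires dropping -mcmodel=kernel, it helps to look at the one operation whose code generation changes shape: taking the address of a global. The snippet below is a stand-alone illustration, not kernel code; the assembly in the comments is typical GCC output and may differ between compiler versions.

static unsigned long scratch;

unsigned long *scratch_addr(void)
{
	/*
	 * -mcmodel=kernel:  movq $scratch, %rax       (R_X86_64_32S, assumes
	 *                                              the fixed top-2G mapping)
	 * -fPIE:            leaq scratch(%rip), %rax  (position independent)
	 *
	 * The 32-bit signed form is exactly the kind of relocation the
	 * arch/x86/tools/relocs check is meant to catch in a PIE build.
	 */
	return &scratch;
}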
[PATCH v4 27/27] x86/kaslr: Add option to extend KASLR range from 1GB to 3GB
Add a new CONFIG_RANDOMIZE_BASE_LARGE option to benefit from PIE support. It increases the KASLR range from 1GB to 3GB. The new range stars at 0x just above the EFI memory region. This option is off by default. The boot code is adapted to create the appropriate page table spanning three PUD pages. The relocation table uses 64-bit integers generated with the updated relocation tool with the large-reloc option. Signed-off-by: Thomas Garnier --- arch/x86/Kconfig | 21 + arch/x86/boot/compressed/Makefile| 5 + arch/x86/boot/compressed/misc.c | 10 +- arch/x86/include/asm/page_64_types.h | 9 + arch/x86/kernel/head64.c | 15 --- arch/x86/kernel/head_64.S| 11 ++- 6 files changed, 66 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 47cf21e452d2..10eea5f440de 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -,6 +,27 @@ config X86_PIE select DYNAMIC_MODULE_BASE select MODULE_REL_CRCS if MODVERSIONS +config RANDOMIZE_BASE_LARGE + bool "Increase the randomization range of the kernel image" + depends on X86_64 && RANDOMIZE_BASE + select X86_PIE + select X86_MODULE_PLTS if MODULES + default n + ---help--- + Build the kernel as a Position Independent Executable (PIE) and + increase the available randomization range from 1GB to 3GB. + + This option impacts performance on kernel CPU intensive workloads up + to 10% due to PIE generated code. Impact on user-mode processes and + typical usage would be significantly less (0.50% when you build the + kernel). + + The kernel and modules will generate slightly more assembly (1 to 2% + increase on the .text sections). The vmlinux binary will be + significantly smaller due to less relocations. + + If unsure say N + config HOTPLUG_CPU bool "Support for hot-pluggable CPUs" depends on SMP diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index fa42f895fdde..8497ebd5e078 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -116,7 +116,12 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += $(patsubst $(obj)/%,%,$(vmlinux-objs-y)) vmlinux.bin.all vmlinux.relocs +# Large randomization require bigger relocation table +ifeq ($(CONFIG_RANDOMIZE_BASE_LARGE),y) +CMD_RELOCS = arch/x86/tools/relocs --large-reloc +else CMD_RELOCS = arch/x86/tools/relocs +endif quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(CMD_RELOCS) $< > $@;$(CMD_RELOCS) --abs-relocs $< $(obj)/vmlinux.relocs: vmlinux FORCE diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index 8dd1d5ccae58..28d17bd5bad8 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -171,10 +171,18 @@ void __puthex(unsigned long value) } #if CONFIG_X86_NEED_RELOCS + +/* Large randomization go lower than -2G and use large relocation table */ +#ifdef CONFIG_RANDOMIZE_BASE_LARGE +typedef long rel_t; +#else +typedef int rel_t; +#endif + static void handle_relocations(void *output, unsigned long output_len, unsigned long virt_addr) { - int *reloc; + rel_t *reloc; unsigned long delta, map, ptr; unsigned long min_addr = (unsigned long)output; unsigned long max_addr = min_addr + (VO___bss_start - VO__text); diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 2c5a966dc222..85ea681421d2 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -46,7 +46,11 @@ #define __PAGE_OFFSET __PAGE_OFFSET_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ +#ifdef CONFIG_RANDOMIZE_BASE_LARGE +#define __START_KERNEL_map 
_AC(0x, UL) +#else #define __START_KERNEL_map _AC(0x8000, UL) +#endif /* CONFIG_RANDOMIZE_BASE_LARGE */ /* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ @@ -64,9 +68,14 @@ * 512MiB by default, leaving 1.5GiB for modules once the page tables * are fully set up. If kernel ASLR is configured, it can extend the * kernel page table mapping, reducing the size of the modules area. + * On PIE, we relocate the binary 2G lower so add this extra space. */ #if defined(CONFIG_RANDOMIZE_BASE) +#ifdef CONFIG_RANDOMIZE_BASE_LARGE +#define KERNEL_IMAGE_SIZE (_AC(3, UL) * 1024 * 1024 * 1024) +#else #define KERNEL_IMAGE_SIZE (1024 * 1024 * 1024) +#endif #else #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 3a1ce822e1c0..e18cc23b9d99 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -63,6 +63,7 @@ EXPORT_SYM
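The rel_t switch in misc.c above exists because a 32-bit signed relocation entry only round-trips addresses that sign-extend back into the top 2G. A small stand-alone check makes the arithmetic concrete (the sample addresses are illustrative, not kernel constants):

#include <stdint.h>
#include <stdio.h>

static int fits_32bit_reloc(uint64_t addr)
{
	int32_t truncated = (int32_t)addr;

	/* a 32-bit entry is usable only if sign extension recovers the address */
	return (uint64_t)(int64_t)truncated == addr;
}

int main(void)
{
	printf("%d\n", fits_32bit_reloc(0xffffffff81000000ull));	/* top 2G: 1 */
	printf("%d\n", fits_32bit_reloc(0xffffffff40000000ull));	/* below -2G: 0 */
	return 0;
}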
[PATCH v4 19/27] kvm: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. The new __ASM_MOVABS macro is used to get the address of a symbol on both 32 and 64-bit with PIE support. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/include/asm/kvm_host.h | 8 ++-- arch/x86/kernel/kvm.c | 6 -- arch/x86/kvm/svm.c | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 130874077c93..6afb2161263d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1389,9 +1389,13 @@ asmlinkage void kvm_spurious_fault(void); ".pushsection .fixup, \"ax\" \n" \ "667: \n\t" \ cleanup_insn "\n\t" \ - "cmpb $0, kvm_rebooting \n\t" \ + "cmpb $0, kvm_rebooting" __ASM_SEL(,(%%rip)) " \n\t" \ "jne 668b \n\t" \ - __ASM_SIZE(push) " $666b \n\t"\ + __ASM_SIZE(push) "$0 \n\t" \ + __ASM_SIZE(push) "%%" _ASM_AX " \n\t" \ + _ASM_MOVABS " $666b, %%" _ASM_AX "\n\t" \ + _ASM_MOV " %%" _ASM_AX ", " __ASM_SEL(4,8) "(%%" _ASM_SP ") \n\t" \ + __ASM_SIZE(pop) "%%" _ASM_AX " \n\t"\ "call kvm_spurious_fault \n\t"\ ".popsection \n\t" \ _ASM_EXTABLE(666b, 667b) diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..38716c409a98 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -726,8 +726,10 @@ asm( ".global __raw_callee_save___kvm_vcpu_is_preempted;" ".type __raw_callee_save___kvm_vcpu_is_preempted, @function;" "__raw_callee_save___kvm_vcpu_is_preempted:" -"movq __per_cpu_offset(,%rdi,8), %rax;" -"cmpb $0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);" +"leaq __per_cpu_offset(%rip), %rax;" +"movq (%rax,%rdi,8), %rax;" +"addq " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rip), %rax;" +"cmpb $0, (%rax);" "setne %al;" "ret;" ".popsection"); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index b2e7140f23ea..bf09d1993d8d 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -707,12 +707,12 @@ static u32 svm_msrpm_offset(u32 msr) static inline void clgi(void) { - asm volatile (__ex(SVM_CLGI)); + asm volatile (__ex(SVM_CLGI) : :); } static inline void stgi(void) { - asm volatile (__ex(SVM_STGI)); + asm volatile (__ex(SVM_STGI) : :); } static inline void invlpga(unsigned long addr, u32 asid) -- 2.17.0.921.gf22659ad46-goog
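The __per_cpu_offset change above follows a generic pattern: an indexed access whose base was an absolute symbol becomes a RIP-relative lea of the base followed by the same indexed load. Below is a stand-alone sketch of that pattern, with a local array standing in for the kernel symbol.

unsigned long pcpu_table[16];	/* local stand-in for __per_cpu_offset */

unsigned long load_indexed(unsigned long idx)
{
	unsigned long val;

	asm ("leaq pcpu_table(%%rip), %0\n\t"	/* take the base PC-relatively */
	     "movq (%0,%1,8), %0"		/* then index as before */
	     : "=&r" (val)
	     : "r" (idx));
	return val;
}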
[PATCH v4 16/27] compiler: Option to add PROVIDE_HIDDEN replacement for weak symbols
Provide an option to have a PROVIDE_HIDDEN (linker script) entry for each weak symbol. This option solves an error in x86_64 where the linker optimizes PIE generated code to be non-PIE because --emit-relocs was used instead of -pie (to reduce dynamic relocations). Signed-off-by: Thomas Garnier --- init/Kconfig| 7 +++ scripts/link-vmlinux.sh | 14 ++ 2 files changed, 21 insertions(+) diff --git a/init/Kconfig b/init/Kconfig index f16247675f84..20e2bf2ae4b4 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1954,6 +1954,13 @@ config ASN1 inform it as to what tags are to be expected in a stream and what functions to call on what tags. +config WEAK_PROVIDE_HIDDEN + bool + help + Generate linker script PROVIDE_HIDDEN entries for all weak symbols. It + allows to prevent non-PIE code being replaced by the linker if the + emit-relocs option is used instead of PIE (useful for x86_64 PIE). + source "kernel/Kconfig.locks" config ARCH_HAS_SYNC_CORE_BEFORE_USERMODE diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh index 4bf811c09f59..f5d31119b9d7 100755 --- a/scripts/link-vmlinux.sh +++ b/scripts/link-vmlinux.sh @@ -142,6 +142,17 @@ kallsyms() ${CC} ${aflags} -c -o ${2} ${afile} } +gen_weak_provide_hidden() +{ +if [ -n "${CONFIG_WEAK_PROVIDE_HIDDEN}" ]; then +local pattern="s/^\s\+ w \(\w\+\)$/PROVIDE_HIDDEN(\1 = .);/gp" +echo -e "SECTIONS {\n. = _end;" > .tmp_vmlinux_hiddenld +${NM} ${1} | sed -n "${pattern}" >> .tmp_vmlinux_hiddenld +echo "}" >> .tmp_vmlinux_hiddenld +LDFLAGS_vmlinux="${LDFLAGS_vmlinux} -T .tmp_vmlinux_hiddenld" +fi +} + # Create map file with all symbols from ${1} # See mksymap for additional details mksysmap() @@ -226,6 +237,9 @@ modpost_link vmlinux.o # modpost vmlinux.o to check for section mismatches ${MAKE} -f "${srctree}/scripts/Makefile.modpost" vmlinux.o +# Generate weak linker script +gen_weak_provide_hidden vmlinux.o + kallsymso="" kallsyms_vmlinux="" if [ -n "${CONFIG_KALLSYMS}" ]; then -- 2.17.0.921.gf22659ad46-goog
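The kind of reference these PROVIDE_HIDDEN entries protect is the classic guarded call through a weak symbol, sketched below. This only illustrates the reference pattern whose code generation is at stake; the linker-script generation itself is the shell hunk above.

/* declared but possibly never defined anywhere */
extern void optional_debug_hook(void) __attribute__((weak));

void maybe_call_hook(void)
{
	if (optional_debug_hook)	/* weak + undefined resolves to NULL */
		optional_debug_hook();
}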
[PATCH v4 15/27] compiler: Option to default to hidden symbols
Provide an option to default visibility to hidden except for key symbols. This option is disabled by default and will be used by x86_64 PIE support to remove errors between compilation units. The default visibility is also enabled for external symbols that are compared as they maybe equals (start/end of sections). In this case, older versions of GCC will remove the comparison if the symbols are hidden. This issue exists at least on gcc 4.9 and before. Signed-off-by: Thomas Garnier --- arch/x86/boot/boot.h | 2 +- arch/x86/include/asm/setup.h | 2 +- arch/x86/kernel/cpu/microcode/core.c | 4 ++-- drivers/base/firmware_loader/main.c | 4 ++-- include/asm-generic/sections.h | 6 ++ include/linux/compiler.h | 7 +++ init/Kconfig | 7 +++ kernel/kallsyms.c| 16 kernel/trace/trace.h | 4 ++-- lib/dynamic_debug.c | 4 ++-- 10 files changed, 38 insertions(+), 18 deletions(-) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index ef5a9cc66fb8..d726c35bdd96 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -193,7 +193,7 @@ static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) } /* Heap -- available for dynamic lists. */ -extern char _end[]; +extern char _end[] __default_visibility; extern char *HEAP; extern char *heap_end; #define RESET_HEAP() ((void *)( HEAP = _end )) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index ae13bc974416..083a6e99b884 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -68,7 +68,7 @@ static inline void x86_ce4100_early_setup(void) { } * This is set up by the setup-routine at boot-time */ extern struct boot_params boot_params; -extern char _text[]; +extern char _text[] __default_visibility; static inline bool kaslr_enabled(void) { diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 77e201301528..6a4f5d9d7eb6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -149,8 +149,8 @@ static bool __init check_loader_disabled_bsp(void) return *res; } -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; +extern struct builtin_fw __start_builtin_fw[] __default_visibility; +extern struct builtin_fw __end_builtin_fw[] __default_visibility; bool get_builtin_firmware(struct cpio_data *cd, const char *name) { diff --git a/drivers/base/firmware_loader/main.c b/drivers/base/firmware_loader/main.c index 0943e7065e0e..2ffd019af2d4 100644 --- a/drivers/base/firmware_loader/main.c +++ b/drivers/base/firmware_loader/main.c @@ -94,8 +94,8 @@ static struct firmware_cache fw_cache; #ifdef CONFIG_FW_LOADER -extern struct builtin_fw __start_builtin_fw[]; -extern struct builtin_fw __end_builtin_fw[]; +extern struct builtin_fw __start_builtin_fw[] __default_visibility; +extern struct builtin_fw __end_builtin_fw[] __default_visibility; static void fw_copy_to_prealloc_buf(struct firmware *fw, void *buf, size_t size) diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 849cd8eb5ca0..0a0e23405ddd 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -32,6 +32,9 @@ * __softirqentry_text_start, __softirqentry_text_end * __start_opd, __end_opd */ +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility push(default) +#endif extern char _text[], _stext[], _etext[]; extern char _data[], _sdata[], _edata[]; extern char __bss_start[], __bss_stop[]; @@ -49,6 +52,9 @@ extern char __start_once[], __end_once[]; /* Start and end of .ctors section - used for 
constructor calls. */ extern char __ctors_start[], __ctors_end[]; +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility pop +#endif /* Start and end of .opd section - used for function descriptors. */ extern char __start_opd[], __end_opd[]; diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ab4711c63601..a9ac84e37af9 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -278,6 +278,13 @@ unsigned long read_word_at_a_time(const void *addr) __u.__val; \ }) +#ifdef CONFIG_DEFAULT_HIDDEN +#pragma GCC visibility push(hidden) +#define __default_visibility __attribute__((visibility ("default"))) +#else +#define __default_visibility +#endif + #endif /* __KERNEL__ */ #endif /* __ASSEMBLY__ */ diff --git a/init/Kconfig b/init/Kconfig index e4acab9f9fd1..f16247675f84 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1694,6 +1694,13 @@ config PROFILING config TRACEPOINTS bool +# +# Default to hidden visibility for all symbols. +# Useful for Position Independent Code to reduce global references. +# +config DEFAULT_HIDDEN + bool + source &qu
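A stand-alone sketch of the visibility pattern the patch introduces may make the intent clearer: everything defaults to hidden so references stay direct and PIE-friendly, while symbols whose addresses are compared across objects (section boundaries) keep default visibility so older GCC versions do not optimize the comparison away. The names below are placeholders, not kernel symbols.

#pragma GCC visibility push(hidden)

#define __default_visibility __attribute__((visibility("default")))

int module_local_counter;		/* hidden: direct access, no GOT indirection */

/* compared across objects, so keep default visibility (placeholder names) */
extern char section_start[] __default_visibility;
extern char section_end[] __default_visibility;

static inline unsigned long section_size(void)
{
	return (unsigned long)(section_end - section_start);
}

#pragma GCC visibility pop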
[PATCH v4 14/27] x86/percpu: Adapt percpu for PIE support
Perpcu uses a clever design where the .percu ELF section has a virtual address of zero and the relocation code avoid relocating specific symbols. It makes the code simple and easily adaptable with or without SMP support. This design is incompatible with PIE because generated code always try to access the zero virtual address relative to the default mapping address. It becomes impossible when KASLR is configured to go below -2G. This patch solves this problem by removing the zero mapping and adapting the GS base to be relative to the expected address. These changes are done only when PIE is enabled. The original implementation is kept as-is by default. The assembly and PER_CPU macros are changed to use relative references when PIE is enabled. The KALLSYMS_ABSOLUTE_PERCPU configuration is disabled with PIE given percpu symbols are not absolute in this case. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/entry/calling.h | 2 +- arch/x86/entry/entry_64.S| 4 ++-- arch/x86/include/asm/percpu.h| 25 +++-- arch/x86/include/asm/processor.h | 4 +++- arch/x86/kernel/head_64.S| 4 arch/x86/kernel/setup_percpu.c | 5 - arch/x86/kernel/vmlinux.lds.S| 13 +++-- arch/x86/lib/cmpxchg16b_emu.S| 8 arch/x86/xen/xen-asm.S | 12 ++-- init/Kconfig | 2 +- 10 files changed, 55 insertions(+), 24 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 352e70cd33e8..d6c60e6b598f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -218,7 +218,7 @@ For 32-bit we have the following conventions - kernel is built with .endm #define THIS_CPU_user_pcid_flush_mask \ - PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + PER_CPU_VAR(cpu_tlbstate + TLB_STATE_user_pcid_flush_mask) .macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 1cbf4c3616a8..f9b42ca4bf60 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -359,7 +359,7 @@ ENTRY(__switch_to_asm) #ifdef CONFIG_CC_STACKPROTECTOR movqTASK_stack_canary(%rsi), %rbx - movq%rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset + movq%rbx, PER_CPU_VAR(irq_stack_union + stack_canary_offset) #endif #ifdef CONFIG_RETPOLINE @@ -897,7 +897,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw + (TSS_ist + ((x) - 1) * 8)) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index a06b07399d17..7d1271b536ea 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -5,9 +5,11 @@ #ifdef CONFIG_X86_64 #define __percpu_seg gs #define __percpu_mov_opmovq +#define __percpu_rel (%rip) #else #define __percpu_seg fs #define __percpu_mov_opmovl +#define __percpu_rel #endif #ifdef __ASSEMBLY__ @@ -28,10 +30,14 @@ #define PER_CPU(var, reg) \ __percpu_mov_op %__percpu_seg:this_cpu_off, reg;\ lea var(reg), reg -#define PER_CPU_VAR(var) %__percpu_seg:var +/* Compatible with Position Independent Code */ +#define PER_CPU_VAR(var) %__percpu_seg:(var)##__percpu_rel +/* Rare absolute reference */ +#define PER_CPU_VAR_ABS(var) %__percpu_seg:var #else /* ! 
SMP */ #define PER_CPU(var, reg) __percpu_mov_op $var, reg -#define PER_CPU_VAR(var) var +#define PER_CPU_VAR(var) (var)##__percpu_rel +#define PER_CPU_VAR_ABS(var) var #endif /* SMP */ #ifdef CONFIG_X86_64_SMP @@ -209,27 +215,34 @@ do { \ pfo_ret__; \ }) +/* Position Independent code uses relative addresses only */ +#ifdef CONFIG_X86_PIE +#define __percpu_stable_arg __percpu_arg(a1) +#else +#define __percpu_stable_arg __percpu_arg(P1) +#endif + #define percpu_stable_op(op, var) \ ({ \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(P1)",%0"\ + asm(op "b "__percpu_stable_ar
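Not part of the patch, but a sketch of the arithmetic it implies: once the percpu section is no longer linked at address zero, a percpu symbol resolves to an address inside the kernel image, so the per-CPU segment base has to carry the difference rather than the raw per-CPU area address. The helper and its inputs are hypothetical.

unsigned long percpu_segment_base(unsigned long cpu_area,
				  unsigned long percpu_link_addr)
{
	/*
	 * zero-based layout: symbols are plain offsets -> base = cpu_area
	 * PIE layout: symbols start at their link address -> bias the base so
	 * that base + symbol still lands inside this CPU's copy
	 */
	return cpu_area - percpu_link_addr;
}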
[PATCH v4 09/27] x86/acpi: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier Acked-by: Pavel Machek --- arch/x86/kernel/acpi/wakeup_64.S | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 50b8ed0317a3..472659c0f811 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,7 +14,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ ENTRY(wakeup_long64) - movqsaved_magic, %rax + movqsaved_magic(%rip), %rax movq$0x123456789abcdef0, %rdx cmpq%rdx, %rax jne bogus_64_magic @@ -25,14 +25,14 @@ ENTRY(wakeup_long64) movw%ax, %es movw%ax, %fs movw%ax, %gs - movqsaved_rsp, %rsp + movqsaved_rsp(%rip), %rsp - movqsaved_rbx, %rbx - movqsaved_rdi, %rdi - movqsaved_rsi, %rsi - movqsaved_rbp, %rbp + movqsaved_rbx(%rip), %rbx + movqsaved_rdi(%rip), %rdi + movqsaved_rsi(%rip), %rsi + movqsaved_rbp(%rip), %rbp - movqsaved_rip, %rax + movqsaved_rip(%rip), %rax jmp *%rax ENDPROC(wakeup_long64) @@ -45,7 +45,7 @@ ENTRY(do_suspend_lowlevel) xorl%eax, %eax callsave_processor_state - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movq%rsp, pt_regs_sp(%rax) movq%rbp, pt_regs_bp(%rax) movq%rsi, pt_regs_si(%rax) @@ -64,13 +64,14 @@ ENTRY(do_suspend_lowlevel) pushfq popqpt_regs_flags(%rax) - movq$.Lresume_point, saved_rip(%rip) + leaq.Lresume_point(%rip), %rax + movq%rax, saved_rip(%rip) - movq%rsp, saved_rsp - movq%rbp, saved_rbp - movq%rbx, saved_rbx - movq%rdi, saved_rdi - movq%rsi, saved_rsi + movq%rsp, saved_rsp(%rip) + movq%rbp, saved_rbp(%rip) + movq%rbx, saved_rbx(%rip) + movq%rdi, saved_rdi(%rip) + movq%rsi, saved_rsi(%rip) addq$8, %rsp movl$3, %edi @@ -82,7 +83,7 @@ ENTRY(do_suspend_lowlevel) .align 4 .Lresume_point: /* We don't restore %rax, it must be 0 anyway */ - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movqsaved_context_cr4(%rax), %rbx movq%rbx, %cr4 movqsaved_context_cr3(%rax), %rbx -- 2.17.0.921.gf22659ad46-goog
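The saved_rip hunk above illustrates a second constraint besides plain RIP-relative addressing: x86-64 has no store of a 64-bit immediate, so "movq $label, mem" only worked while the label was guaranteed to live in the sign-extended top 2G. Below is a stand-alone sketch of the replacement sequence, with local stand-ins for the wakeup code's labels.

void (*saved_rip_slot)(void);	/* stand-in for saved_rip */

void resume_target(void)	/* stand-in for .Lresume_point */
{
}

void record_resume_point(void)
{
	asm ("leaq resume_target(%%rip), %%rax\n\t"	/* materialize address PC-relatively */
	     "movq %%rax, saved_rip_slot(%%rip)"	/* then store the full 64-bit value */
	     : : : "rax", "memory");
}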
[PATCH v4 06/27] x86/entry/64: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier --- arch/x86/entry/entry_64.S| 18 -- arch/x86/kernel/relocate_kernel_64.S | 8 +++- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3166b9674429..1cbf4c3616a8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -191,7 +191,7 @@ ENTRY(entry_SYSCALL_64_trampoline) * spill RDI and restore it in a second-stage trampoline. */ pushq %rdi - movq$entry_SYSCALL_64_stage2, %rdi + movabsq $entry_SYSCALL_64_stage2, %rdi JMP_NOSPEC %rdi END(entry_SYSCALL_64_trampoline) @@ -1276,7 +1276,8 @@ ENTRY(error_entry) movl%ecx, %eax /* zero extend */ cmpq%rax, RIP+8(%rsp) je .Lbstep_iret - cmpq$.Lgs_change, RIP+8(%rsp) + leaq.Lgs_change(%rip), %rcx + cmpq%rcx, RIP+8(%rsp) jne .Lerror_entry_done /* @@ -1481,10 +1482,10 @@ ENTRY(nmi) * resume the outer NMI. */ - movq$repeat_nmi, %rdx + leaqrepeat_nmi(%rip), %rdx cmpq8(%rsp), %rdx ja 1f - movq$end_repeat_nmi, %rdx + leaqend_repeat_nmi(%rip), %rdx cmpq8(%rsp), %rdx ja nested_nmi_out 1: @@ -1538,7 +1539,8 @@ nested_nmi: pushq %rdx pushfq pushq $__KERNEL_CS - pushq $repeat_nmi + leaqrepeat_nmi(%rip), %rdx + pushq %rdx /* Put stack back */ addq$(6*8), %rsp @@ -1577,7 +1579,11 @@ first_nmi: addq$8, (%rsp) /* Fix up RSP */ pushfq /* RFLAGS */ pushq $__KERNEL_CS/* CS */ - pushq $1f /* RIP */ + pushq $0 /* Futur return address */ + pushq %rax/* Save RAX */ + leaq1f(%rip), %rax /* RIP */ + movq%rax, 8(%rsp) /* Put 1f on return address */ + popq%rax/* Restore RAX */ iretq /* continues at repeat_nmi below */ UNWIND_HINT_IRET_REGS 1: diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index a7227dfe1a2b..0c0fc259a4e2 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -208,11 +208,9 @@ identity_mapped: movq%rax, %cr3 lea PAGE_SIZE(%r8), %rsp callswap_pages - jmp *virtual_mapped_addr(%rip) - - /* Absolute value for PIE support */ -virtual_mapped_addr: - .quad virtual_mapped + movabsq $virtual_mapped, %rax + pushq %rax + ret virtual_mapped: movqRSP(%r8), %rsp -- 2.17.0.921.gf22659ad46-goog
[PATCH v4 09/27] x86/acpi: Adapt assembly for PIE support
Change the assembly code to use only relative references of symbols for the kernel to be PIE compatible. Position Independent Executable (PIE) support will allow to extend the KASLR randomization range 0x8000. Signed-off-by: Thomas Garnier Acked-by: Pavel Machek --- arch/x86/kernel/acpi/wakeup_64.S | 31 --- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 50b8ed0317a3..472659c0f811 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -14,7 +14,7 @@ * Hooray, we are in Long 64-bit mode (but still running in low memory) */ ENTRY(wakeup_long64) - movqsaved_magic, %rax + movqsaved_magic(%rip), %rax movq$0x123456789abcdef0, %rdx cmpq%rdx, %rax jne bogus_64_magic @@ -25,14 +25,14 @@ ENTRY(wakeup_long64) movw%ax, %es movw%ax, %fs movw%ax, %gs - movqsaved_rsp, %rsp + movqsaved_rsp(%rip), %rsp - movqsaved_rbx, %rbx - movqsaved_rdi, %rdi - movqsaved_rsi, %rsi - movqsaved_rbp, %rbp + movqsaved_rbx(%rip), %rbx + movqsaved_rdi(%rip), %rdi + movqsaved_rsi(%rip), %rsi + movqsaved_rbp(%rip), %rbp - movqsaved_rip, %rax + movqsaved_rip(%rip), %rax jmp *%rax ENDPROC(wakeup_long64) @@ -45,7 +45,7 @@ ENTRY(do_suspend_lowlevel) xorl%eax, %eax callsave_processor_state - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movq%rsp, pt_regs_sp(%rax) movq%rbp, pt_regs_bp(%rax) movq%rsi, pt_regs_si(%rax) @@ -64,13 +64,14 @@ ENTRY(do_suspend_lowlevel) pushfq popqpt_regs_flags(%rax) - movq$.Lresume_point, saved_rip(%rip) + leaq.Lresume_point(%rip), %rax + movq%rax, saved_rip(%rip) - movq%rsp, saved_rsp - movq%rbp, saved_rbp - movq%rbx, saved_rbx - movq%rdi, saved_rdi - movq%rsi, saved_rsi + movq%rsp, saved_rsp(%rip) + movq%rbp, saved_rbp(%rip) + movq%rbx, saved_rbx(%rip) + movq%rdi, saved_rdi(%rip) + movq%rsi, saved_rsi(%rip) addq$8, %rsp movl$3, %edi @@ -82,7 +83,7 @@ ENTRY(do_suspend_lowlevel) .align 4 .Lresume_point: /* We don't restore %rax, it must be 0 anyway */ - movq$saved_context, %rax + leaqsaved_context(%rip), %rax movqsaved_context_cr4(%rax), %rbx movq%rbx, %cr4 movqsaved_context_cr3(%rax), %rbx -- 2.17.0.921.gf22659ad46-goog
Re: repeatable boot randomness inside KVM guest
On Mon, Apr 16, 2018 at 8:54 AM Kees Cook <keesc...@chromium.org> wrote: > On Sat, Apr 14, 2018 at 3:44 PM, Theodore Y. Ts'o <ty...@mit.edu> wrote: > > +linux...@kvack.org > > k...@vger.kernel.org, secur...@kernel.org moved to bcc > > > > On Sat, Apr 14, 2018 at 10:59:21PM +0300, Alexey Dobriyan wrote: > >> SLAB allocators got CONFIG_SLAB_FREELIST_RANDOM option which randomizes > >> allocation pattern inside a slab: > >> > >> int cache_random_seq_create(struct kmem_cache *cachep, unsigned int count, gfp_t gfp) > >> { > >> ... > >> /* Get best entropy at this stage of boot */ > >> prandom_seed_state(, get_random_long()); > >> > >> Then I printed actual random sequences for each kmem cache. > >> Turned out they were all the same for most of the caches and > >> they didn't vary across guest reboots. > > > > The problem is that at the super-early stage of the boot path, kernel code > > can't allocate memory. This is something most device drivers kinda > > assume they can do. :-) > > > > So it means we haven't yet initialized the virtio-rng driver, and it's > > before interrupts have been enabled, so we can't harvest any entropy > > from interrupt timing. So that's why trying to use virtio-rng didn't > > help. > > > >> The only way to get randomness for SLAB is to enable RDRAND inside the guest. > >> > >> Is it a KVM bug? > > > > No, it's not a KVM bug. The fundamental issue is in how > > CONFIG_SLAB_FREELIST_RANDOM is currently implemented. Entropy at early boot in a VM has always been a problem for this feature and others. Did you look at the impact on other boot security features fetching random values? Does your VM have RDRAND support (we use get_random_long(), which will fetch from RDRAND to provide as much entropy as possible at this point)? > > > > What needs to happen is the freelist should get randomized much later in > > the boot sequence. Doing it later will require locking; I don't know > > enough about the slab/slub code to know whether the slab_mutex would > > be sufficient, or some other lock might need to be added. You can't re-randomize pre-allocated pages, which is why the cache is randomized that early. If you don't have RDRAND, we could re-randomize later at boot with more entropy; that could be useful in this specific case. > > > > The other thing I would note is that using prandom_u32_state() doesn't > > really provide much security. In fact, if the goal is to protect > > against a malicious attacker trying to guess what addresses will be > > returned by the slab allocator, I suspect it's much like the security > > patdowns done at airports. It might protect against a really stupid > > attacker, but it's mostly security theater. > > > > The freelist randomization is only being done once, so it's not like > > performance is really an issue. It would be much better to just use > > get_random_u32() and be done with it. I'd drop using prandom_* > > functions in slab.c, slub.c and slab_common.c, and just use a > > really random number generator, if the goal is real security as > > opposed to security for show. The state is seeded with get_random_long(), which will use RDRAND and any available entropy at this point. I am not sure of the value of calling get_random_long() on each iteration, especially if you don't have RDRAND. > > > > (Not that there's necessarily anything wrong with security theater; > > the US spends over 3 billion dollars a year on security theater. As > > politicians know, symbolism can be important. :-) > I've added Thomas Garnier to CC (since he wrote this originally). 
> He can speak to its position in the boot ordering and the effective > entropy. Thanks for including me. > -Kees > -- > Kees Cook > Pixel Security -- Thomas
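As a rough userspace sketch of the behaviour under discussion (this is not the kernel's cache_random_seq_create() or prandom implementation; the toy PRNG and the names below are invented for illustration), the freelist order is fully determined by the one seed captured at cache-creation time, so identical early-boot seeds give identical sequences across reboots:

/* Userspace sketch: the shuffle is deterministic in its seed, so two boots
 * that obtain the same get_random_long()-style value this early will lay
 * out their freelists identically. */
#include <stdint.h>
#include <stdio.h>

static uint64_t state;                     /* stand-in for prandom state */

static uint32_t next_u32(void)
{
        state = state * 6364136223846793005ULL + 1442695040888963407ULL;
        return (uint32_t)(state >> 33);
}

static void shuffle_freelist(unsigned int *idx, unsigned int count, uint64_t seed)
{
        unsigned int i, j, tmp;

        state = seed;                      /* one seed decides the whole order */
        for (i = 0; i < count; i++)
                idx[i] = i;
        for (i = count - 1; i > 0; i--) {  /* Fisher-Yates shuffle */
                j = next_u32() % (i + 1);
                tmp = idx[i]; idx[i] = idx[j]; idx[j] = tmp;
        }
}

int main(void)
{
        unsigned int idx[8], i;

        shuffle_freelist(idx, 8, 0x1234);  /* same seed ...        */
        for (i = 0; i < 8; i++)
                printf("%u ", idx[i]);
        printf("\n");
        shuffle_freelist(idx, 8, 0x1234);  /* ... same sequence    */
        for (i = 0; i < 8; i++)
                printf("%u ", idx[i]);
        printf("\n");
        return 0;
}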
Re: [PATCH] ARM: uaccess: Add missing include for set_thread_flag
On Tue, Sep 19, 2017 at 2:35 PM, Jonathan Liu <net...@gmail.com> wrote: > > Hi Thomas, > > The top of the C source file I am compiling has: > #include > #include > > Tracing through the asm/uaccess.h include I see > asm/uaccess.h -> asm/domain.h -> asm/thread_info.h > > but set_thread_flag is defined in linux/thread_info.h not > asm/thread_info.h (see > http://elixir.free-electrons.com/linux/v4.14-rc1/ident/set_thread_flag). I see, I think I could not reproduce this issue because I already reverted the original commit in favor of a different approach on linux-next (see commit 2404269bc4e77a67875c8db6667be34c9913c96e). Let me know if this commit resolves the issue and thanks for reaching out. > > Regards, > Jonathan > > On 20 September 2017 at 00:32, Thomas Garnier <thgar...@google.com> wrote: > > On Tue, Sep 19, 2017 at 4:50 AM, Jonathan Liu <net...@gmail.com> wrote: > >> Fixes "implicit declaration of function" compile error for out-of-tree > >> kernel modules including asm/uaccess.h. > > > > I failed to reproduce this issue by creating an out-of-tree module > > with a separate file (with only uaccess.h). Are you using a special > > config? > > > > Looking at the headers of uaccess.h, you get thread_info through: > > > > asm/domain.h -> asm/thread_info.h > > > >> > >> Fixes: 73ac5d6a2b6a ("arm/syscalls: Check address limit on user-mode > >> return") > >> Signed-off-by: Jonathan Liu <net...@gmail.com> > >> --- > >> arch/arm/include/asm/uaccess.h | 1 + > >> 1 file changed, 1 insertion(+) > >> > >> diff --git a/arch/arm/include/asm/uaccess.h > >> b/arch/arm/include/asm/uaccess.h > >> index 87936dd5d151..13d1877ffb75 100644 > >> --- a/arch/arm/include/asm/uaccess.h > >> +++ b/arch/arm/include/asm/uaccess.h > >> @@ -17,6 +17,7 @@ > >> #include > >> #include > >> > >> +#include > >> #include > >> > >> /* > >> -- > >> 2.13.2 > >> > > > > > > > > -- > > Thomas -- Thomas
Re: [PATCH] ARM: uaccess: Add missing include for set_thread_flag
On Tue, Sep 19, 2017 at 4:50 AM, Jonathan Liu wrote: > Fixes "implicit declaration of function" compile error for out-of-tree > kernel modules including asm/uaccess.h. I failed to reproduce this issue by creating an out of tree module with a separate file (with only uaccess.h). Are you using a special config? Looking at the headers on uaccess.h. You get thread_info through: asm/domain.h -> asm/thread_info.h > > Fixes: 73ac5d6a2b6a ("arm/syscalls: Check address limit on user-mode return") > Signed-off-by: Jonathan Liu > --- > arch/arm/include/asm/uaccess.h | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h > index 87936dd5d151..13d1877ffb75 100644 > --- a/arch/arm/include/asm/uaccess.h > +++ b/arch/arm/include/asm/uaccess.h > @@ -17,6 +17,7 @@ > #include > #include > > +#include > #include > > /* > -- > 2.13.2 > -- Thomas
[tip:core/urgent] arm64/syscalls: Move address limit check in loop
Commit-ID: a2048e34d4655c06d31400646ae495bbfeb16b27 Gitweb: http://git.kernel.org/tip/a2048e34d4655c06d31400646ae495bbfeb16b27 Author: Thomas Garnier <thgar...@google.com> AuthorDate: Thu, 7 Sep 2017 08:30:47 -0700 Committer: Thomas Gleixner <t...@linutronix.de> CommitDate: Sun, 17 Sep 2017 19:45:33 +0200 arm64/syscalls: Move address limit check in loop A bug was reported on ARM where set_fs might be called after it was checked on the work pending function. ARM64 is not affected by this bug but has a similar construct. In order to avoid any similar problems in the future, the addr_limit_user_check function is moved at the beginning of the loop. Fixes: cf7de27ab351 ("arm64/syscalls: Check address limit on user-mode return") Reported-by: Leonard Crestez <leonard.cres...@nxp.com> Signed-off-by: Thomas Garnier <thgar...@google.com> Signed-off-by: Kees Cook <keesc...@chromium.org> Signed-off-by: Thomas Gleixner <t...@linutronix.de> Cc: Pratyush Anand <pan...@redhat.com> Cc: Dave Martin <dave.mar...@arm.com> Cc: Will Drewry <w...@chromium.org> Cc: Arnd Bergmann <a...@arndb.de> Cc: Catalin Marinas <catalin.mari...@arm.com> Cc: Will Deacon <will.dea...@arm.com> Cc: Russell King <li...@armlinux.org.uk> Cc: Andy Lutomirski <l...@amacapital.net> Cc: David Howells <dhowe...@redhat.com> Cc: Dave Hansen <dave.han...@intel.com> Cc: Al Viro <v...@zeniv.linux.org.uk> Cc: linux-...@vger.kernel.org Cc: Yonghong Song <y...@fb.com> Cc: linux-arm-ker...@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-5-git-send-email-keesc...@chromium.org --- arch/arm64/kernel/signal.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index c45214f..0bdc96c 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -751,10 +751,10 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, */ trace_hardirqs_off(); - /* Check valid user FS if needed */ - addr_limit_user_check(); - do { + /* Check valid user FS if needed */ + addr_limit_user_check(); + if (thread_flags & _TIF_NEED_RESCHED) { schedule(); } else {
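A rough userspace model of the control-flow change (not kernel code; the flags and handlers below are stand-ins): because work handled inside the do/while loop may itself change the address limit, checking once before the loop can miss a late set_fs(), while checking at the top of every iteration catches it on the next pass:

/* Userspace model of the loop only: the check runs first on every pass,
 * so a limit changed while handling work is caught one iteration later. */
#include <stdbool.h>
#include <stdio.h>

#define USER_LIMIT 0x0000800000000000UL        /* stand-in for USER_DS */

static unsigned long addr_limit = USER_LIMIT;
static bool have_signal = true, need_resched = true;

static void addr_limit_user_check(void)       /* models the generic check */
{
        if (addr_limit != USER_LIMIT)
                printf("invalid address limit, task would be killed\n");
}

int main(void)
{
        do {
                addr_limit_user_check();      /* now first thing in the loop */

                if (have_signal) {
                        addr_limit = ~0UL;    /* a path that calls set_fs(KERNEL_DS) */
                        have_signal = false;
                } else if (need_resched) {
                        need_resched = false; /* schedule() */
                }
        } while (have_signal || need_resched); /* re-checked on the next pass */

        return 0;
}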
[tip:core/urgent] arm/syscalls: Optimize address limit check
Commit-ID: e33f8d32677fa4f4f8996ef46748f86aac81ccff Gitweb: http://git.kernel.org/tip/e33f8d32677fa4f4f8996ef46748f86aac81ccff Author: Thomas Garnier <thgar...@google.com> AuthorDate: Thu, 7 Sep 2017 08:30:46 -0700 Committer: Thomas Gleixner <t...@linutronix.de> CommitDate: Sun, 17 Sep 2017 19:45:33 +0200 arm/syscalls: Optimize address limit check Disable the generic address limit check in favor of an architecture specific optimized implementation. The generic implementation using pending work flags did not work well with ARM and alignment faults. The address limit is checked on each syscall return path to user-mode path as well as the irq user-mode return function. If the address limit was changed, a function is called to report data corruption (stopping the kernel or process based on configuration). The address limit check has to be done before any pending work because they can reset the address limit and the process is killed using a SIGKILL signal. For example the lkdtm address limit check does not work because the signal to kill the process will reset the user-mode address limit. Signed-off-by: Thomas Garnier <thgar...@google.com> Signed-off-by: Kees Cook <keesc...@chromium.org> Tested-by: Kees Cook <keesc...@chromium.org> Tested-by: Leonard Crestez <leonard.cres...@nxp.com> Reviewed-by: Kees Cook <keesc...@chromium.org> Signed-off-by: Thomas Gleixner <t...@linutronix.de> Cc: Pratyush Anand <pan...@redhat.com> Cc: Dave Martin <dave.mar...@arm.com> Cc: Will Drewry <w...@chromium.org> Cc: Arnd Bergmann <a...@arndb.de> Cc: Catalin Marinas <catalin.mari...@arm.com> Cc: Will Deacon <will.dea...@arm.com> Cc: Russell King <li...@armlinux.org.uk> Cc: Andy Lutomirski <l...@amacapital.net> Cc: David Howells <dhowe...@redhat.com> Cc: Dave Hansen <dave.han...@intel.com> Cc: Al Viro <v...@zeniv.linux.org.uk> Cc: linux-...@vger.kernel.org Cc: Yonghong Song <y...@fb.com> Cc: linux-arm-ker...@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-4-git-send-email-keesc...@chromium.org --- arch/arm/kernel/entry-common.S | 11 +++ arch/arm/kernel/signal.c | 7 +++ 2 files changed, 18 insertions(+) diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index 0b60adf..99c9082 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_AEABI #include #endif @@ -48,10 +49,14 @@ ret_fast_syscall: UNWIND(.fnstart ) UNWIND(.cantunwind) disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blneaddr_limit_check_failed ldr r1, [tsk, #TI_FLAGS]@ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending + /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr @@ -74,6 +79,9 @@ ret_fast_syscall: UNWIND(.cantunwind) str r0, [sp, #S_R0 + S_OFF]!@ save returned r0 disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blneaddr_limit_check_failed ldr r1, [tsk, #TI_FLAGS]@ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending @@ -106,6 +114,9 @@ ENTRY(ret_to_user) ret_slow_syscall: disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blneaddr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne slow_work_pending diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 5814298..b67ae12 100644 --- 
a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -673,3 +674,9 @@ struct page *get_signal_page(void) return page; } + +/* Defer to generic check */ +asmlinkage void addr_limit_check_failed(void) +{ + addr_limit_user_check(); +}
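As a rough C model of the fast-path/slow-path split the added assembly implements (not kernel code; the struct and names below are stand-ins): the hot return path only loads the saved limit and compares it against TASK_SIZE, branching to an out-of-line reporting function on mismatch, so the common case costs one load and one compare:

/* Userspace model of the three-instruction check on the return paths. */
#include <stdio.h>

#define TASK_SIZE 0x0000800000000000UL         /* stand-in value */

struct thread_info_model {
        unsigned long addr_limit;              /* models TI_ADDR_LIMIT */
};

/* out-of-line slow path, only reached when the limit was changed */
static void addr_limit_check_failed(void)
{
        printf("invalid address limit on user-mode return\n");
}

/* models: ldr r2, [tsk, #TI_ADDR_LIMIT]; cmp r2, #TASK_SIZE; blne ... */
static inline void check_on_return(const struct thread_info_model *ti)
{
        if (ti->addr_limit != TASK_SIZE)
                addr_limit_check_failed();
}

int main(void)
{
        struct thread_info_model ti = { .addr_limit = TASK_SIZE };

        check_on_return(&ti);                  /* fast path: nothing happens */
        ti.addr_limit = ~0UL;                  /* models a leaked set_fs(KERNEL_DS) */
        check_on_return(&ti);                  /* slow path fires */
        return 0;
}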
[tip:core/urgent] Revert "arm/syscalls: Check address limit on user-mode return"
Commit-ID: 2404269bc4e77a67875c8db6667be34c9913c96e Gitweb: http://git.kernel.org/tip/2404269bc4e77a67875c8db6667be34c9913c96e Author: Thomas Garnier <thgar...@google.com> AuthorDate: Thu, 7 Sep 2017 08:30:45 -0700 Committer: Thomas Gleixner <t...@linutronix.de> CommitDate: Sun, 17 Sep 2017 19:45:33 +0200 Revert "arm/syscalls: Check address limit on user-mode return" This reverts commit 73ac5d6a2b6ac3ae8d1e1818f3e9946f97489bc9. The work pending loop can call set_fs after addr_limit_user_check removed the _TIF_FSCHECK flag. This may happen at anytime based on how ARM handles alignment exceptions. It leads to an infinite loop condition. After discussion, it has been agreed that the generic approach is not tailored to the ARM architecture and any fix might not be complete. This patch will be replaced by an architecture specific implementation. The work flag approach will be kept for other architectures. Reported-by: Leonard Crestez <leonard.cres...@nxp.com> Signed-off-by: Thomas Garnier <thgar...@google.com> Signed-off-by: Kees Cook <keesc...@chromium.org> Signed-off-by: Thomas Gleixner <t...@linutronix.de> Cc: Pratyush Anand <pan...@redhat.com> Cc: Dave Martin <dave.mar...@arm.com> Cc: Will Drewry <w...@chromium.org> Cc: Arnd Bergmann <a...@arndb.de> Cc: Catalin Marinas <catalin.mari...@arm.com> Cc: Will Deacon <will.dea...@arm.com> Cc: Russell King <li...@armlinux.org.uk> Cc: Andy Lutomirski <l...@amacapital.net> Cc: David Howells <dhowe...@redhat.com> Cc: Dave Hansen <dave.han...@intel.com> Cc: Al Viro <v...@zeniv.linux.org.uk> Cc: linux-...@vger.kernel.org Cc: Yonghong Song <y...@fb.com> Cc: linux-arm-ker...@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-3-git-send-email-keesc...@chromium.org --- arch/arm/include/asm/thread_info.h | 15 ++- arch/arm/include/asm/uaccess.h | 2 -- arch/arm/kernel/entry-common.S | 9 ++--- arch/arm/kernel/signal.c | 5 - 4 files changed, 8 insertions(+), 23 deletions(-) diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 1d468b5..776757d 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -139,11 +139,10 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define TIF_NEED_RESCHED 1 /* rescheduling necessary */ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_UPROBE 3 /* breakpointed or singlestepping */ -#define TIF_FSCHECK4 /* Check FS is USER_DS on return */ -#define TIF_SYSCALL_TRACE 5 /* syscall trace active */ -#define TIF_SYSCALL_AUDIT 6 /* syscall auditing active */ -#define TIF_SYSCALL_TRACEPOINT 7 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP8 /* seccomp syscall filtering active */ +#define TIF_SYSCALL_TRACE 4 /* syscall trace active */ +#define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ +#define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ +#define TIF_SECCOMP7 /* seccomp syscall filtering active */ #define TIF_NOHZ 12 /* in adaptive nohz mode */ #define TIF_USING_IWMMXT 17 @@ -154,7 +153,6 @@ extern int vfp_restore_user_hwstate(struct user_vfp __user *, #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) #define _TIF_UPROBE(1 << TIF_UPROBE) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT(1 << TIF_SYSCALL_TRACEPOINT) @@ -168,9 +166,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp 
__user *, /* * Change these and you break ASM code in entry-common.S */ -#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ -_TIF_NOTIFY_RESUME | _TIF_UPROBE | \ -_TIF_FSCHECK) +#define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ +_TIF_NOTIFY_RESUME | _TIF_UPROBE) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/include/asm/uaccess.h b/arch/arm/include/asm/uaccess.h index 87936dd..0bf2347 100644 --- a/arch/arm/include/asm/uaccess.h +++ b/arch/arm/include/asm/uaccess.h @@ -70,8 +70,6 @@ static inline void set_fs(mm_segment_t fs) { current_thread_info()->addr_limit = fs; modify_domain(DOMAIN_KERNEL, fs ? DOMAIN_CLIENT : DOMAIN_MANAGER); - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); } #define segment_eq(a, b) ((a)
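A rough userspace model of the failure mode behind this revert (not kernel code; the helpers are simplified stand-ins): once handling pending work can itself call set_fs(), the TIF_FSCHECK work flag is raised again on every pass, so a loop that keeps running while work flags are set never drains:

/* Userspace model: the flag is cleared by the check but re-set by work
 * handling, so the pending-work loop would never terminate (bounded here
 * only so the example exits). */
#include <stdbool.h>
#include <stdio.h>

static bool tif_fscheck;

static void set_fs_model(void)              /* models set_fs() setting TIF_FSCHECK */
{
        tif_fscheck = true;
}

static void addr_limit_user_check_model(void)
{
        tif_fscheck = false;                /* check done, clear the flag */
}

int main(void)
{
        int passes = 0;

        set_fs_model();                     /* e.g. alignment fault handling ran */
        while (tif_fscheck && passes < 5) { /* the real loop has no bound */
                addr_limit_user_check_model();
                set_fs_model();             /* work handling calls set_fs again */
                passes++;
        }
        printf("work still pending after %d passes\n", passes);
        return 0;
}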
[tip:core/urgent] syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check
Commit-ID: bf29ed1567b67854dc13504f685c45a2ea9b2081 Gitweb: http://git.kernel.org/tip/bf29ed1567b67854dc13504f685c45a2ea9b2081 Author: Thomas Garnier <thgar...@google.com> AuthorDate: Thu, 7 Sep 2017 08:30:44 -0700 Committer: Thomas Gleixner <t...@linutronix.de> CommitDate: Sun, 17 Sep 2017 19:45:32 +0200 syscalls: Use CHECK_DATA_CORRUPTION for addr_limit_user_check Use CHECK_DATA_CORRUPTION instead of BUG_ON to provide more flexibility on address limit failures. By default, send a SIGKILL signal to kill the current process preventing exploitation of a bad address limit. Make the TIF_FSCHECK flag optional so ARM can use this function. Signed-off-by: Thomas Garnier <thgar...@google.com> Signed-off-by: Kees Cook <keesc...@chromium.org> Signed-off-by: Thomas Gleixner <t...@linutronix.de> Cc: Pratyush Anand <pan...@redhat.com> Cc: Dave Martin <dave.mar...@arm.com> Cc: Will Drewry <w...@chromium.org> Cc: Arnd Bergmann <a...@arndb.de> Cc: Catalin Marinas <catalin.mari...@arm.com> Cc: Will Deacon <will.dea...@arm.com> Cc: Russell King <li...@armlinux.org.uk> Cc: Andy Lutomirski <l...@amacapital.net> Cc: David Howells <dhowe...@redhat.com> Cc: Dave Hansen <dave.han...@intel.com> Cc: Al Viro <v...@zeniv.linux.org.uk> Cc: linux-...@vger.kernel.org Cc: Yonghong Song <y...@fb.com> Cc: linux-arm-ker...@lists.infradead.org Link: http://lkml.kernel.org/r/1504798247-48833-2-git-send-email-keesc...@chromium.org --- include/linux/syscalls.h | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 95606a2..a78186d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -221,21 +221,25 @@ static inline int is_syscall_trace_event(struct trace_event_call *tp_event) } \ static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__)) -#ifdef TIF_FSCHECK /* * Called before coming back to user-mode. Returning to user-mode with an * address limit different than USER_DS can allow to overwrite kernel memory. */ static inline void addr_limit_user_check(void) { - +#ifdef TIF_FSCHECK if (!test_thread_flag(TIF_FSCHECK)) return; +#endif - BUG_ON(!segment_eq(get_fs(), USER_DS)); + if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS), + "Invalid address limit on user-mode return")) + force_sig(SIGKILL, current); + +#ifdef TIF_FSCHECK clear_thread_flag(TIF_FSCHECK); -} #endif +} asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
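A userspace analog of the pattern the patch switches to (this is not the kernel's CHECK_DATA_CORRUPTION macro; the function and the BUG_ON_DATA_CORRUPTION toggle below are stand-ins): instead of unconditionally hitting BUG_ON, the corrupted state is reported and only the offending task is killed unless the stricter policy is compiled in:

/* Userspace sketch of "report, then kill the task rather than the machine". */
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

#define BUG_ON_DATA_CORRUPTION 0               /* stand-in for the Kconfig option */

static bool check_data_corruption(bool condition, const char *msg)
{
        if (!condition)
                return false;
        if (BUG_ON_DATA_CORRUPTION) {
                fprintf(stderr, "BUG: %s\n", msg);
                abort();                       /* models BUG() */
        }
        fprintf(stderr, "WARNING: %s\n", msg); /* models the warning path */
        return true;                           /* caller decides the recovery */
}

int main(void)
{
        unsigned long user_ds = 0x0000800000000000UL;
        unsigned long addr_limit = ~0UL;       /* pretend set_fs(KERNEL_DS) leaked */

        if (check_data_corruption(addr_limit != user_ds,
                                  "Invalid address limit on user-mode return"))
                raise(SIGKILL);                /* models force_sig(SIGKILL, current) */

        return 0;
}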