On SGI UV systems, the kernel occasionally hangs with KASLR enabled. The back trace is:
kernel BUG at arch/x86/mm/init_64.c:311! invalid opcode: 0000 [#1] SMP [...] RIP: 0010:__init_extra_mapping+0x188/0x196 [...] Call Trace: init_extra_mapping_uc+0x13/0x15 map_high+0x67/0x75 map_mmioh_high_uv3+0x20a/0x219 uv_system_init_hub+0x12d9/0x1496 uv_system_init+0x27/0x29 native_smp_prepare_cpus+0x28d/0x2d8 kernel_init_freeable+0xdd/0x253 ? rest_init+0x80/0x80 kernel_init+0xe/0x110 ret_from_fork+0x2c/0x40 The root cause is that the SGI UV system needs to map its MMIOH regions into the direct mapping section. When KASLR is disabled, there is 64TB of space for the direct mapping of system RAM, and both system RAM and the SGI UV MMIOH regions share this 64TB space. However, with KASLR enabled, mm KASLR only reserves the actual size of system RAM plus 10TB for direct mapping usage. The MMIOH mapping of SGI UV can then go beyond the upper bound of the direct mapping section and step into the VMALLOC or VMEMMAP area, which triggers the BUG_ON() in __init_extra_mapping(). E.g. on the SGI UV3 machine where this bug was reported, there are two MMIOH regions: [ 1.519001] UV: Map MMIOH0_HI 0xffc00000000 - 0x100000000000 [ 1.523001] UV: Map MMIOH1_HI 0x100000000000 - 0x200000000000 They are [16TB-16GB, 16TB) and [16TB, 32TB). On this machine, 512GB of RAM is spread out over 1TB regions. The above two SGI MMIOH regions will then also be mapped into the direct mapping section. The point is that SGI UV maps its MMIOH regions into the direct mapping section in uv_system_init(), which is called during kernel_init_freeable(). That is much later than kernel_randomize_memory(), so mm KASLR has no chance to take them into consideration. To fix it, we need to detect early whether this is an SGI UV system and, if so, not adapt the size of the direct mapping section in kernel_randomize_memory(). According to a discussion with an SGI developer, "The SGI bios adds UVsystab. Only systems running SGI bios (and now HPE Hawks2) will have UVsystab."
Hence check whether UVsystab is present and, if so, do not adapt the size of the direct mapping section in kernel_randomize_memory(). Signed-off-by: Baoquan He <b...@redhat.com> Cc: Thomas Gleixner <t...@linutronix.de> Cc: Ingo Molnar <mi...@redhat.com> Cc: "H. Peter Anvin" <h...@zytor.com> Cc: x...@kernel.org Cc: Matt Fleming <m...@codeblueprint.co.uk> Cc: Ard Biesheuvel <ard.biesheu...@linaro.org> Cc: "tra...@sgi.com" <tra...@sgi.com> Cc: Dimitri Sivanich <sivan...@hpe.com> Cc: Thomas Garnier <thgar...@google.com> Cc: Kees Cook <keesc...@chromium.org> Cc: Andrew Morton <a...@linux-foundation.org> Cc: Masahiro Yamada <yamada.masah...@socionext.com> Cc: Russ Anderson <r...@sgi.com> Cc: Frank Ramsay <frank.ram...@hpe.com> linux-...@vger.kernel.org --- arch/x86/include/asm/uv/uv.h | 2 ++ arch/x86/mm/kaslr.c | 4 +++- arch/x86/platform/efi/efi.c | 7 +++++++ 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h index 6686820..0275776 100644 --- a/arch/x86/include/asm/uv/uv.h +++ b/arch/x86/include/asm/uv/uv.h @@ -11,6 +11,7 @@ struct mm_struct; extern enum uv_system_type get_uv_system_type(void); extern int is_uv_system(void); extern int is_uv_hubless(void); +extern int is_early_uv_system(void); extern void uv_cpu_init(void); extern void uv_nmi_init(void); extern void uv_system_init(void); @@ -25,6 +26,7 @@ extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, static inline enum uv_system_type get_uv_system_type(void) { return UV_NONE; } static inline int is_uv_system(void) { return 0; } static inline int is_uv_hubless(void) { return 0; } +static inline int is_early_uv_system(void) { return 0; } static inline void uv_cpu_init(void) { } static inline void uv_system_init(void) { } static inline const struct cpumask * diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index aed2064..b75e1f5 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -22,11 +22,13 @@ #include 
<linux/kernel.h> #include <linux/init.h> #include <linux/random.h> +#include <linux/efi.h> #include <asm/pgalloc.h> #include <asm/pgtable.h> #include <asm/setup.h> #include <asm/kaslr.h> +#include <asm/uv/uv.h> #include "mm_internal.h" @@ -123,7 +125,7 @@ void __init kernel_randomize_memory(void) CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* Adapt phyiscal memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + if (memory_tb < kaslr_regions[0].size_tb && !is_early_uv_system()) kaslr_regions[0].size_tb = memory_tb; /* Calculate entropy available between regions */ diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 7e76a4d..b71e01d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -74,6 +74,13 @@ static int __init setup_add_efi_memmap(char *arg) } early_param("add_efi_memmap", setup_add_efi_memmap); +#ifdef CONFIG_X86_UV +int is_early_uv_system(void) +{ + return !((efi.uv_systab == EFI_INVALID_TABLE_ADDR) || !efi.uv_systab); +} +#endif + static efi_status_t __init phys_efi_set_virtual_address_map( unsigned long memory_map_size, unsigned long descriptor_size, -- 2.5.5