On 06/24/19 at 03:31pm, Kirill A. Shutemov wrote:
> Kyle has reported that kernel crashes sometimes when it boots in
> 5-level paging mode with KASLR enabled:
> 
>   WARNING: CPU: 0 PID: 0 at arch/x86/mm/init_64.c:87 phys_p4d_init+0x1d4/0x1ea
>   RIP: 0010:phys_p4d_init+0x1d4/0x1ea
>   Call Trace:
>    __kernel_physical_mapping_init+0x10a/0x35c
>    kernel_physical_mapping_init+0xe/0x10
>    init_memory_mapping+0x1aa/0x3b0
>    init_range_memory_mapping+0xc8/0x116
>    init_mem_mapping+0x225/0x2eb
>    setup_arch+0x6ff/0xcf5
>    start_kernel+0x64/0x53b
>    ? copy_bootdata+0x1f/0xce
>    x86_64_start_reservations+0x24/0x26
>    x86_64_start_kernel+0x8a/0x8d
>    secondary_startup_64+0xb6/0xc0
> 
> which causes later:
> 
>   BUG: unable to handle page fault for address: ff484d019580eff8
>   #PF: supervisor read access in kernel mode
>   #PF: error_code(0x0000) - not-present page
>   BAD
>   Oops: 0000 [#1] SMP NOPTI
>   RIP: 0010:fill_pud+0x13/0x130
>   Call Trace:
>    set_pte_vaddr_p4d+0x2e/0x50
>    set_pte_vaddr+0x6f/0xb0
>    __native_set_fixmap+0x28/0x40
>    native_set_fixmap+0x39/0x70
>    register_lapic_address+0x49/0xb6
>    early_acpi_boot_init+0xa5/0xde
>    setup_arch+0x944/0xcf5
>    start_kernel+0x64/0x53b
> 
> Kyle bisected the issue to commit b569c1843498 ("x86/mm/KASLR: Reduce
> randomization granularity for 5-level paging to 1GB")
> 
> Before the commit PAGE_OFFSET is always aligned to P4D_SIZE if we boot in
> 5-level paging mode. But now only PUD_SIZE alignment is guaranteed.
> 
> For phys_p4d_init() it means that 'paddr_next' after the first iteration
> can belong to the same p4d entry.
> 
> In the case I was able to reproduce the 'vaddr' on the first iteration
> is 0xff4228027fe00000 and 'paddr' is 0x33fe00000. On the second
> iteration 'vaddr' becomes 0xff42287f40000000 and 'paddr' is
> 0x8000000000. The 'vaddr' in both cases belong to the same p4d entry.
> 
> It screws up 'i' count: we miss the last entry in the page table
> completely.  And it confuses the branch under 'paddr >= paddr_end'
> condition: the p4d entry can be cleared where must not to.
> 
> The patch makes phys_p4d_init() walk by virtual address space, like
> __kernel_physical_mapping_init() does. It makes it work correctly with
> phys-virt mismatch.
> 
> Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
> Reported-and-tested-by: Kyle Pelton <kyle.d.pel...@intel.com>
> Fixes: b569c1843498 ("x86/mm/KASLR: Reduce randomization granularity for 
> 5-level paging to 1GB")
> Cc: Baoquan He <b...@redhat.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shute...@linux.intel.com>
> ---
>  arch/x86/mm/init_64.c | 24 +++++++++++++-----------
>  1 file changed, 13 insertions(+), 11 deletions(-)

Looks good to me, thanks.

Acked-by: Baoquan He <b...@redhat.com>

> 
> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
> index 693aaf28d5fe..0f01c7b1d217 100644
> --- a/arch/x86/mm/init_64.c
> +++ b/arch/x86/mm/init_64.c
> @@ -671,23 +671,25 @@ static unsigned long __meminit
>  phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, unsigned long paddr_end,
>             unsigned long page_size_mask, bool init)
>  {
> -     unsigned long paddr_next, paddr_last = paddr_end;
> -     unsigned long vaddr = (unsigned long)__va(paddr);
> -     int i = p4d_index(vaddr);
> +     unsigned long vaddr, vaddr_end, vaddr_next, paddr_next, paddr_last;
> +
> +     paddr_last = paddr_end;
> +     vaddr = (unsigned long)__va(paddr);
> +     vaddr_end = (unsigned long)__va(paddr_end);
>  
>       if (!pgtable_l5_enabled())
>               return phys_pud_init((pud_t *) p4d_page, paddr, paddr_end,
>                                    page_size_mask, init);
>  
> -     for (; i < PTRS_PER_P4D; i++, paddr = paddr_next) {
> -             p4d_t *p4d;
> +     for (; vaddr < vaddr_end; vaddr = vaddr_next) {
> +             p4d_t *p4d = p4d_page + p4d_index(vaddr);
>               pud_t *pud;
>  
> -             vaddr = (unsigned long)__va(paddr);
> -             p4d = p4d_page + p4d_index(vaddr);
> -             paddr_next = (paddr & P4D_MASK) + P4D_SIZE;
> +             vaddr_next = (vaddr & P4D_MASK) + P4D_SIZE;
> +             paddr = __pa(vaddr);
>  
>               if (paddr >= paddr_end) {
> +                     paddr_next = __pa(vaddr_next);
>                       if (!after_bootmem &&
>                           !e820__mapped_any(paddr & P4D_MASK, paddr_next,
>                                            E820_TYPE_RAM) &&
> @@ -699,13 +701,13 @@ phys_p4d_init(p4d_t *p4d_page, unsigned long paddr, 
> unsigned long paddr_end,
>  
>               if (!p4d_none(*p4d)) {
>                       pud = pud_offset(p4d, 0);
> -                     paddr_last = phys_pud_init(pud, paddr, paddr_end,
> -                                                page_size_mask, init);
> +                     paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
> +                                     page_size_mask, init);
>                       continue;
>               }
>  
>               pud = alloc_low_page();
> -             paddr_last = phys_pud_init(pud, paddr, paddr_end,
> +             paddr_last = phys_pud_init(pud, paddr, __pa(vaddr_end),
>                                          page_size_mask, init);
>  
>               spin_lock(&init_mm.page_table_lock);
> -- 
> 2.21.0
> 

Reply via email to