On 12/12/2012 09:12 PM, H. Peter Anvin wrote:
Here is a version that compiles.  It doesn't *boot* yet, because the
switchover from dynamic mode to the real pagetables doesn't happen right
and so we end up on an uninitialized set of page tables.

The new page table setup in tip:x86/mm2 should make that easier to
achieve, however... I won't have time to test this out tonight, though.

     -hpa

Well, minus a simple brainfart, it now actually gets into the page table setup.

        -hpa


--
H. Peter Anvin, Intel Open Source Technology Center
I work for Intel.  I don't speak on their behalf.
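
The idea of the patch below, as a rough C sketch (simplified, illustrative
names only -- the real code is in the head64.c hunk, which additionally
handles running out of spare pages by wiping and reusing the pool):

/*
 * Illustrative sketch, not the actual kernel code.  "pool" plays the
 * role of early_dynamic_pgts, "top_level" the role of early_level4_pgt;
 * the flag values and the __pa() stand-in are placeholders.
 */
#define TABLE_FLAGS  0x063UL	/* present | rw | accessed | dirty */
#define LARGE_FLAGS  0x0e3UL	/* same, plus PSE (2 MiB); no global bit */
#define PAGE_OFFSET  0xffff880000000000UL
#define __pa(x)      ((unsigned long)(x) - 0xffffffff80000000UL)

static unsigned long pool[64][512];	/* 64 spare page-table pages */
static int next_free;

/* Called from the early #PF handler with the faulting kernel address. */
static int demand_map(unsigned long vaddr, unsigned long *top_level)
{
	unsigned long paddr = vaddr - PAGE_OFFSET;
	unsigned long *pud = pool[next_free++];	/* fresh 3rd-level table */
	unsigned long *pmd = pool[next_free++];	/* fresh 2nd-level table */
	int i;

	/* Cover the surrounding 1 GiB with 2 MiB mappings. */
	for (i = 0; i < 512; i++)
		pmd[i] = ((paddr & ~0x3fffffffUL) + i * 0x200000UL) | LARGE_FLAGS;

	top_level[(vaddr >> 39) & 511] = __pa(pud) | TABLE_FLAGS;
	pud[(vaddr >> 30) & 511] = __pa(pmd) | TABLE_FLAGS;
	return 0;
}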

diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
index 766ea16..2d88344 100644
--- a/arch/x86/include/asm/pgtable_64_types.h
+++ b/arch/x86/include/asm/pgtable_64_types.h
@@ -1,6 +1,8 @@
 #ifndef _ASM_X86_PGTABLE_64_DEFS_H
 #define _ASM_X86_PGTABLE_64_DEFS_H
 
+#include <asm/sparsemem.h>
+
 #ifndef __ASSEMBLY__
 #include <linux/types.h>
 
@@ -60,4 +62,6 @@ typedef struct { pteval_t pte; } pte_t;
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
 
+#define EARLY_DYNAMIC_PAGE_TABLES      64
+
 #endif /* _ASM_X86_PGTABLE_64_DEFS_H */
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 037df57..9443c77 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -26,11 +26,73 @@
 #include <asm/e820.h>
 #include <asm/bios_ebda.h>
 
-static void __init zap_identity_mappings(void)
+/*
+ * Manage page tables very early on.
+ */
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD];
+static unsigned int __initdata next_early_pgt = 2, early_pgt_resets = 0;
+
+/* Wipe all early page tables except for the kernel symbol map */
+static void __init reset_early_page_tables(void)
 {
-       pgd_t *pgd = pgd_offset_k(0UL);
-       pgd_clear(pgd);
-       __flush_tlb_all();
+       unsigned long i;
+
+       for (i = 0; i < PTRS_PER_PGD-1; i++)
+               early_level4_pgt[i].pgd = 0;
+
+       next_early_pgt = 0;
+       early_pgt_resets++;
+
+       __native_flush_tlb();
+}
+
+/* Create a new PMD entry */
+int __init early_make_pgtable(unsigned long address)
+{
+       unsigned long physaddr = address - __PAGE_OFFSET;
+       unsigned long i;
+       pgdval_t pgd, *pgd_p;
+       pudval_t *pud_p;
+       pmdval_t pmd, *pmd_p;
+
+       if (physaddr >= MAXMEM)
+               return -1;      /* Invalid address - puke */
+
+       i = (address >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1);
+       pgd_p = &early_level4_pgt[i].pgd;
+       pgd = *pgd_p;
+
+       /*
+        * The use of __START_KERNEL_map rather than __PAGE_OFFSET here is
+        * critical -- __PAGE_OFFSET would point us back into the dynamic
+        * range and we might end up looping forever...
+        */
+       if (pgd && next_early_pgt < EARLY_DYNAMIC_PAGE_TABLES) {
+               pud_p = (pudval_t *)((pgd & PTE_PFN_MASK) + __START_KERNEL_map);
+       } else {
+               if (next_early_pgt >= EARLY_DYNAMIC_PAGE_TABLES-1)
+                       reset_early_page_tables();
+
+               pud_p = (pudval_t *)early_dynamic_pgts[next_early_pgt++];
+               for (i = 0; i < PTRS_PER_PUD; i++)
+                       pud_p[i] = 0;
+
+               *pgd_p = (pgdval_t)pud_p - __START_KERNEL_map + _KERNPG_TABLE;
+       }
+       i = (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
+       pud_p += i;
+
+       pmd_p = (pmdval_t *)early_dynamic_pgts[next_early_pgt++];
+       pmd = (physaddr & PUD_MASK) + (__PAGE_KERNEL_LARGE & ~_PAGE_GLOBAL);
+       for (i = 0; i < PTRS_PER_PMD; i++) {
+               pmd_p[i] = pmd;
+               pmd += PMD_SIZE;
+       }
+
+       *pud_p = (pudval_t)pmd_p - __START_KERNEL_map + _KERNPG_TABLE;
+
+       return 0;
 }
 
 /* Don't add a printk in there. printk relies on the PDA which is not initialized
@@ -70,12 +132,13 @@ void __init x86_64_start_kernel(char * real_mode_data)
                                (__START_KERNEL & PGDIR_MASK)));
        BUILD_BUG_ON(__fix_to_virt(__end_of_fixed_addresses) <= MODULES_END);
 
+       /* Kill off the identity-map trampoline */
+       reset_early_page_tables();
+
        /* clear bss before set_intr_gate with early_idt_handler */
        clear_bss();
 
-       /* Make NULL pointers segfault */
-       zap_identity_mappings();
-
+       /* XXX - this is wrong... we need to build page tables from scratch */
        max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 
        for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
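
For concreteness, the index computation in early_make_pgtable() above splits
the faulting address like this (illustrative arithmetic; the example assumes
the classic 0xffff880000000000 direct-map base):

unsigned long addr  = 0xffff880140000000UL;	/* __PAGE_OFFSET + 5 GiB */
unsigned long pgd_i = (addr >> 39) & 511;	/* PGDIR_SHIFT -> 272 */
unsigned long pud_i = (addr >> 30) & 511;	/* PUD_SHIFT   ->   5 */
/* the new PMD page is then filled wholesale with 512 x 2 MiB entries,
   so no PMD index is needed */
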
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 94bf9cc..d539692 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -47,14 +47,13 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
        .code64
        .globl startup_64
 startup_64:
-
        /*
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded an identity mapped page table
         * for us.  These identity mapped page tables map all of the
         * kernel pages and possibly all of memory.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either directly from a 64bit bootloader, or from
         * arch/x86_64/boot/compressed/head.S.
@@ -66,7 +65,8 @@ startup_64:
         * tables and then reload them.
         */
 
-       /* Compute the delta between the address I am compiled to run at and the
+       /*
+        * Compute the delta between the address I am compiled to run at and the
         * address I am actually running at.
         */
        leaq    _text(%rip), %rbp
@@ -78,53 +78,66 @@ startup_64:
        testl   %eax, %eax
        jnz     bad_address
 
-       /* Is the address too large? */
-       leaq    _text(%rip), %rdx
-       movq    $PGDIR_SIZE, %rax
-       cmpq    %rax, %rdx
-       jae     bad_address
-
-       /* Fixup the physical addresses in the page table
+       /*
+        * Is the address too large?
         */
-       addq    %rbp, init_level4_pgt + 0(%rip)
-       addq    %rbp, init_level4_pgt + (L4_PAGE_OFFSET*8)(%rip)
-       addq    %rbp, init_level4_pgt + (L4_START_KERNEL*8)(%rip)
+       leaq    _text(%rip), %rax
+       shrq    $MAX_PHYSMEM_BITS, %rax
+       jnz     bad_address
 
-       addq    %rbp, level3_ident_pgt + 0(%rip)
+       /*
+        * Fixup the physical addresses in the page table
+        */
+       addq    %rbp, early_level4_pgt + (L4_START_KERNEL*8)(%rip)
 
        addq    %rbp, level3_kernel_pgt + (510*8)(%rip)
        addq    %rbp, level3_kernel_pgt + (511*8)(%rip)
 
        addq    %rbp, level2_fixmap_pgt + (506*8)(%rip)
 
-       /* Add an Identity mapping if I am above 1G */
+       /*
+        * Set up the identity mapping for the switchover.  These
+        * entries should *NOT* have the global bit set!  This also
+        * creates a bunch of nonsense entries but that is fine --
+        * it avoids problems around wraparound.
+        */
        leaq    _text(%rip), %rdi
-       andq    $PMD_PAGE_MASK, %rdi
+       leaq    early_level4_pgt(%rip), %rbx
 
        movq    %rdi, %rax
-       shrq    $PUD_SHIFT, %rax
-       andq    $(PTRS_PER_PUD - 1), %rax
-       jz      ident_complete
+       shrq    $PGDIR_SHIFT, %rax
 
-       leaq    (level2_spare_pgt - __START_KERNEL_map + _KERNPG_TABLE)(%rbp), %rdx
-       leaq    level3_ident_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
+       leaq    (4096 + _KERNPG_TABLE)(%rbx), %rdx
+       movq    %rdx, 0(%rbx,%rax,8)
+       movq    %rdx, 8(%rbx,%rax,8)
 
+       addq    $4096, %rdx
        movq    %rdi, %rax
-       shrq    $PMD_SHIFT, %rax
-       andq    $(PTRS_PER_PMD - 1), %rax
-       leaq    __PAGE_KERNEL_IDENT_LARGE_EXEC(%rdi), %rdx
-       leaq    level2_spare_pgt(%rip), %rbx
-       movq    %rdx, 0(%rbx, %rax, 8)
-ident_complete:
+       shrq    $PUD_SHIFT, %rax
+       andl    $(PTRS_PER_PUD-1), %eax
+       movq    %rdx, (4096+0)(%rbx,%rax,8)
+       movq    %rdx, (4096+8)(%rbx,%rax,8)
 
+       addq    $8192, %rbx
+       movq    %rdi, %rax
+       shrq    $PMD_SHIFT, %rdi
+       addq    $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax
+       movl    $PTRS_PER_PMD, %ecx
+
+1:
+       andq    $(PTRS_PER_PMD - 1), %rdi
+       movq    %rax, (%rbx,%rdi,8)
+       incq    %rdi
+       addq    $PMD_SIZE, %rax
+       decl    %ecx
+       jnz     1b
+       
        /*
         * Fixup the kernel text+data virtual addresses. Note that
         * we might write invalid pmds, when the kernel is relocated
         * cleanup_highmap() fixes this up along with the mappings
         * beyond _end.
         */
-
        leaq    level2_kernel_pgt(%rip), %rdi
        leaq    4096(%rdi), %r8
        /* See if it is a valid page table entry */
@@ -139,17 +152,14 @@ ident_complete:
        /* Fixup phys_base */
        addq    %rbp, phys_base(%rip)
 
-       /* Due to ENTRY(), sometimes the empty space gets filled with
-        * zeros. Better take a jmp than relying on empty space being
-        * filled with 0x90 (nop)
-        */
-       jmp secondary_startup_64
+       movq    $(early_level4_pgt - __START_KERNEL_map), %rax
+       jmp 1f
 ENTRY(secondary_startup_64)
        /*
         * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 1,
         * and someone has loaded a mapped page table.
         *
-        * %esi holds a physical pointer to real_mode_data.
+        * %rsi holds a physical pointer to real_mode_data.
         *
         * We come here either from startup_64 (using physical addresses)
         * or from trampoline.S (using virtual addresses).
@@ -159,12 +169,14 @@ ENTRY(secondary_startup_64)
         * after the boot processor executes this code.
         */
 
+       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
+1:
+
        /* Enable PAE mode and PGE */
-       movl    $(X86_CR4_PAE | X86_CR4_PGE), %eax
-       movq    %rax, %cr4
+       movl    $(X86_CR4_PAE | X86_CR4_PGE), %ecx
+       movq    %rcx, %cr4
 
        /* Setup early boot stage 4 level pagetables. */
-       movq    $(init_level4_pgt - __START_KERNEL_map), %rax
        addq    phys_base(%rip), %rax
        movq    %rax, %cr3
 
@@ -196,7 +208,7 @@ ENTRY(secondary_startup_64)
        movq    %rax, %cr0
 
        /* Setup a boot time stack */
-       movq stack_start(%rip),%rsp
+       movq stack_start(%rip), %rsp
 
        /* zero EFLAGS after setting rsp */
        pushq $0
@@ -236,31 +248,31 @@ ENTRY(secondary_startup_64)
        movl    initial_gs+4(%rip),%edx
        wrmsr   
 
-       /* esi is pointer to real mode structure with interesting info.
+       /* rsi is pointer to real mode structure with interesting info.
           pass it to C */
-       movl    %esi, %edi
+       movq    %rsi, %rdi
        
        /* Finally jump to run C code and to be on real kernel address
         * Since we are running on identity-mapped space we have to jump
         * to the full 64bit address, this is only possible as indirect
         * jump.  In addition we need to ensure %cs is set so we make this
-        * a far return.
+        * a far jump.
         */
-       movq    initial_code(%rip),%rax
        pushq   $0              # fake return address to stop unwinder
-       pushq   $__KERNEL_CS    # set correct cs
-       pushq   %rax            # target address in negative space
-       lretq
+       /* gas 2.22 is buggy and mis-assembles ljmpq */
+       rex64 ljmp *initial_code(%rip)
 
        /* SMP bootup changes these two */
        __REFDATA
-       .align  8
-       ENTRY(initial_code)
+       .balign 8
+       GLOBAL(initial_code)
        .quad   x86_64_start_kernel
-       ENTRY(initial_gs)
+       .word   __KERNEL_CS
+       .balign 8
+       GLOBAL(initial_gs)
        .quad   INIT_PER_CPU_VAR(irq_stack_union)
 
-       ENTRY(stack_start)
+       GLOBAL(stack_start)
        .quad  init_thread_union+THREAD_SIZE-8
        .word  0
        __FINITDATA
@@ -268,7 +280,7 @@ ENTRY(secondary_startup_64)
 bad_address:
        jmp bad_address
 
-       .section ".init.text","ax"
+       __INIT
        .globl early_idt_handlers
 early_idt_handlers:
        # 104(%rsp) %rflags
@@ -305,14 +317,22 @@ ENTRY(early_idt_handler)
        pushq %r11              #  0(%rsp)
 
        cmpl $__KERNEL_CS,96(%rsp)
-       jne 10f
+       jne 11f
 
+       cmpl $14,72(%rsp)       # Page fault?
+       jnz 10f
+       GET_CR2_INTO(%rdi)      # can clobber any volatile register if pv
+       call early_make_pgtable
+       andl %eax,%eax
+       jz 20f                  # All good
+
+10:
        leaq 88(%rsp),%rdi      # Pointer to %rip
        call early_fixup_exception
        andl %eax,%eax
        jnz 20f                 # Found an exception entry
 
-10:
+11:
 #ifdef CONFIG_EARLY_PRINTK
        GET_CR2_INTO(%r9)       # can clobber any volatile register if pv
        movl 80(%rsp),%r8d      # error code
@@ -334,7 +354,7 @@ ENTRY(early_idt_handler)
 1:     hlt
        jmp 1b
 
-20:    # Exception table entry found
+20:    # Exception table entry found or page table generated
        popq %r11
        popq %r10
        popq %r9
@@ -348,6 +368,8 @@ ENTRY(early_idt_handler)
        decl early_recursion_flag(%rip)
        INTERRUPT_RETURN
 
+       __INITDATA
+       
        .balign 4
 early_recursion_flag:
        .long 0
@@ -358,11 +380,10 @@ early_idt_msg:
 early_idt_ripmsg:
        .asciz "RIP %s\n"
 #endif /* CONFIG_EARLY_PRINTK */
-       .previous
 
 #define NEXT_PAGE(name) \
        .balign PAGE_SIZE; \
-ENTRY(name)
+GLOBAL(name)
 
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)                       \
@@ -372,46 +393,21 @@ ENTRY(name)
        i = i + 1 ;                                     \
        .endr
 
-       .data
-       /*
-        * This default setting generates an ident mapping at address 0x100000
-        * and a mapping for the kernel that precisely maps virtual address
-        * 0xffffffff80000000 to physical address 0x000000. (always using
-        * 2Mbyte large pages provided by PAE mode)
-        */
-NEXT_PAGE(init_level4_pgt)
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
-       .quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .org    init_level4_pgt + L4_START_KERNEL*8, 0
-       /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */
+       __INITDATA
+NEXT_PAGE(early_level4_pgt)
+       .fill   511,8,0
        .quad   level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level3_ident_pgt)
-       .quad   level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
-       .fill   511,8,0
+NEXT_PAGE(early_dynamic_pgts)
+       .fill   512*EARLY_DYNAMIC_PAGE_TABLES,8,0
 
+       .data
 NEXT_PAGE(level3_kernel_pgt)
        .fill   L3_START_KERNEL,8,0
        /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */
        .quad   level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE
        .quad   level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
 
-NEXT_PAGE(level2_fixmap_pgt)
-       .fill   506,8,0
-       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
-       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
-       .fill   5,8,0
-
-NEXT_PAGE(level1_fixmap_pgt)
-       .fill   512,8,0
-
-NEXT_PAGE(level2_ident_pgt)
-       /* Since I easily can, map the first 1G.
-        * Don't set NX because code runs from these pages.
-        */
-       PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
-
 NEXT_PAGE(level2_kernel_pgt)
        /*
         * 512 MB kernel mapping. We spend a full page on this pagetable
@@ -426,11 +422,16 @@ NEXT_PAGE(level2_kernel_pgt)
        PMDS(0, __PAGE_KERNEL_LARGE_EXEC,
                KERNEL_IMAGE_SIZE/PMD_SIZE)
 
-NEXT_PAGE(level2_spare_pgt)
-       .fill   512, 8, 0
+NEXT_PAGE(level2_fixmap_pgt)
+       .fill   506,8,0
+       .quad   level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE
+       /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */
+       .fill   5,8,0
+
+NEXT_PAGE(level1_fixmap_pgt)
+       .fill   512,8,0
 
 #undef PMDS
-#undef NEXT_PAGE
 
        .data
        .align 16
@@ -456,6 +457,7 @@ ENTRY(nmi_idt_table)
        .skip IDT_ENTRIES * 16
 
        __PAGE_ALIGNED_BSS
-       .align PAGE_SIZE
-ENTRY(empty_zero_page)
+NEXT_PAGE(empty_zero_page)
+       .skip PAGE_SIZE
+NEXT_PAGE(init_level4_pgt)
        .skip PAGE_SIZE
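
In rough C, the early_idt_handler change above amounts to the following
dispatch (illustrative pseudocode; the symbol names are real, but
early_panic_and_halt() is a stand-in for the print-registers-and-hlt path):

/* Sketch of the new early #PF handling; control flow is simplified. */
if (regs_cs == __KERNEL_CS) {
	if (vector == 14 && early_make_pgtable(read_cr2()) == 0)
		return;			/* mapping built; retry the access */
	if (early_fixup_exception(&regs_rip))
		return;			/* exception-table entry found */
}
early_panic_and_halt();			/* dump state, then hlt loop */
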
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index ca45696..e383050 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -911,7 +911,6 @@ void __init setup_arch(char **cmdline_p)
                        (max_pfn_mapped<<PAGE_SHIFT) - 1);
 
        setup_real_mode();
-
        init_gbpages();
 
        /* max_pfn_mapped is updated here */
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index cbca565..1650bf4 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -21,7 +21,7 @@ void __init setup_real_mode(void)
        struct trampoline_header *trampoline_header;
        size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
 #ifdef CONFIG_X86_64
-       u64 *trampoline_pgd;
+       pgd_t *trampoline_pgd;
        u64 efer;
 #endif
 
@@ -77,9 +77,17 @@ void __init setup_real_mode(void)
        trampoline_cr4_features = &trampoline_header->cr4;
        *trampoline_cr4_features = read_cr4();
 
-       trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
-       trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE;
-       trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE;
+       trampoline_pgd = (pgd_t *) __va(real_mode_header->trampoline_pgd);
+
+       /* Set up the identity map */
+       clone_pgd_range(trampoline_pgd,
+                       init_level4_pgt + KERNEL_PGD_BOUNDARY,
+                       MAXMEM >> PGDIR_SHIFT);
+
+       /* Set up the kernel map */
+       clone_pgd_range(trampoline_pgd  + KERNEL_PGD_BOUNDARY,
+                       init_level4_pgt + KERNEL_PGD_BOUNDARY,
+                       PTRS_PER_PGD - KERNEL_PGD_BOUNDARY);
 #endif
 }
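
For reference, clone_pgd_range() is essentially a typed memcpy of top-level
entries (its definition in arch/x86/include/asm/pgtable.h is along these
lines), so the trampoline PGD ends up sharing the kernel's lower-level
tables instead of carrying its own private identity-map pages:

static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
{
	memcpy(dst, src, count * sizeof(pgd_t));
}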
 
