(Note: Last week I asked about this on the freebsd-current list.
It turned out slightly harder than I thought, as the 512GB kernel
virtual area is based on what fits into a single L4 page table
entry.)

I was asked to expand the kernel limits for amd64 systems.  While
I do not have a system with enough RAM to test this for real, the
changes below seem to boot and run OK.

I went just a little bit wild in create_pagetables(). :-)  The
lines with the casts got long (and hard to read) so I shortened
them (but I still needed the map I drew of the page tables...).
If using ptoa() like this is OK, probably there should be a few
more of those, e.g., in the changes to pmap_pinit().

Anyway, I wonder if some form of this patch (perhaps even without
the #ifdefs) might be accepted back.  I'm not sure about the KPML4BASE
name, but it clearly needs to be different from KPML4I.  (At first
I was considering moving KERNBASE too but the branch offsets seem
to be the real limiting factor here.)

Possibly dumb question: around the comment "this replaces some of
the KPTphys entries above", would it be possible to reclaim a few
pages by calculating in advance where the 2MB page mappings obviate
the need for the underlying KPTphys pages, and just offset things?

Another note: one could get rid of the "power of 2" requirement
for NDMPML4E.  It arises from the translation between direct
mapped virtual and physical addresses (being |= and &=~), but the
same result can be achieved by adding and subtracting an offset,
which would allow the base and limit to be arbitrary, rather than
a power of two.  (Still, it did not seem worth doing here.)

Chris

diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c
index 272158d..acf5af2 100644
--- a/amd64/amd64/pmap.c
+++ b/amd64/amd64/pmap.c
@@ -534,6 +534,10 @@ static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
        int i, j, ndm1g, nkpdpe;
+       pt_entry_t *pt_p;
+       pd_entry_t *pd_p;
+       pdp_entry_t *pdp_p;
+       pml4_entry_t *p4_p;
 
        /* Allocate page table pages for the direct map */
        ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
@@ -556,6 +560,10 @@ create_pagetables(vm_paddr_t *firstaddr)
         * bootstrap.  We defer this until after all memory-size dependent
         * allocations are done (e.g. direct map), so that we don't have to
         * build in too much slop in our estimate.
+        *
+        * Note that when NKPML4E > 1, we have an empty page underneath
+        * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+        * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
         */
        nkpt_init(*firstaddr);
        nkpdpe = NKPDPE(nkpt);
@@ -564,32 +572,26 @@ create_pagetables(vm_paddr_t *firstaddr)
        KPDphys = allocpages(firstaddr, nkpdpe);
 
        /* Fill in the underlying page table pages */
-       /* Read-only from zero to physfree */
+       /* Nominally read-only (but really R/W) from zero to physfree */
        /* XXX not fully used, underneath 2M pages */
-       for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
-               ((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
-               ((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
-       }
+       pt_p = (pt_entry_t *)KPTphys;
+       for (i = 0; ptoa(i) < *firstaddr; i++)
+               pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
 
        /* Now map the page tables at their location within PTmap */
-       for (i = 0; i < nkpt; i++) {
-               ((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
-               ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
-       }
+       pd_p = (pd_entry_t *)KPDphys;
+       for (i = 0; i < nkpt; i++)
+               pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
 
        /* Map from zero to end of allocations under 2M pages */
        /* This replaces some of the KPTphys entries above */
-       for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
-               ((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
-               ((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
-       }
+       for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+               pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
 
-       /* And connect up the PD to the PDP */
-       for (i = 0; i < nkpdpe; i++) {
-               ((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
-                   (i << PAGE_SHIFT);
-               ((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
-       }
+       /* And connect up the PD to the PDP (leaving room for L4 pages) */
+       pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+       for (i = 0; i < nkpdpe; i++)
+               pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
 
        /*
         * Now, set up the direct map region using 2MB and/or 1GB pages.  If
@@ -599,37 +601,41 @@ create_pagetables(vm_paddr_t *firstaddr)
         * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
         * that are partially used. 
         */
+       pd_p = (pd_entry_t *)DMPDphys;
        for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
-               ((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+               pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
                /* Preset PG_M and PG_A because demotion expects it. */
-               ((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
+               pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
                    PG_M | PG_A;
        }
+       pdp_p = (pdp_entry_t *)DMPDPphys;
        for (i = 0; i < ndm1g; i++) {
-               ((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+               pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
                /* Preset PG_M and PG_A because demotion expects it. */
-               ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
+               pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
                    PG_M | PG_A;
        }
        for (j = 0; i < ndmpdp; i++, j++) {
-               ((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
-               ((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+               pdp_p[i] = DMPDphys + ptoa(j);
+               pdp_p[i] |= PG_RW | PG_V | PG_U;
        }
 
        /* And recursively map PML4 to itself in order to get PTmap */
-       ((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
-       ((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+       p4_p = (pml4_entry_t *)KPML4phys;
+       p4_p[PML4PML4I] = KPML4phys;
+       p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
 
        /* Connect the Direct Map slot(s) up to the PML4. */
        for (i = 0; i < NDMPML4E; i++) {
-               ((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
-                   (i << PAGE_SHIFT);
-               ((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+               p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+               p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
        }
 
-       /* Connect the KVA slot up to the PML4 */
-       ((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
-       ((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+       /* Connect the KVA slots up to the PML4 */
+       for (i = 0; i < NKPML4E; i++) {
+               p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+               p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+       }
 }
 
 /*
@@ -1688,7 +1694,10 @@ pmap_pinit(pmap_t pmap)
                pagezero(pmap->pm_pml4);
 
        /* Wire in kernel global address entries. */
-       pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
+       for (i = 0; i < NKPML4E; i++) {
+               pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
+                   PG_RW | PG_V | PG_U;
+       }
        for (i = 0; i < NDMPML4E; i++) {
                pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
                    PG_RW | PG_V | PG_U;
@@ -1944,7 +1953,8 @@ pmap_release(pmap_t pmap)
 
        m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
-       pmap->pm_pml4[KPML4I] = 0;      /* KVA */
+       for (i = 0; i < NKPML4E; i++)   /* KVA */
+               pmap->pm_pml4[KPML4BASE + i] = 0;
        for (i = 0; i < NDMPML4E; i++)  /* Direct Map */
                pmap->pm_pml4[DMPML4I + i] = 0;
        pmap->pm_pml4[PML4PML4I] = 0;   /* Recursive Mapping */
diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h
index 6d76ec3..58d1c9d 100644
--- a/amd64/include/pmap.h
+++ b/amd64/include/pmap.h
@@ -113,7 +113,17 @@
        ((unsigned long)(l2) << PDRSHIFT) | \
        ((unsigned long)(l1) << PAGE_SHIFT))
 
-#define NKPML4E                1               /* number of kernel PML4 slots */
+/*
+ * Number of kernel PML4 slots.  Can be anywhere from 1 to 64 or so,
+ * but setting it larger than NDMPML4E makes no sense.
+ *
+ * Each slot provides .5 TB of kernel virtual space.
+ */
+#ifdef AMD64_HUGE
+#define NKPML4E                16
+#else
+#define NKPML4E                1
+#endif
 
 #define        NUPML4E         (NPML4EPG/2)    /* number of userland PML4 pages */
 #define        NUPDPE          (NUPML4E*NPDPEPG)/* number of userland PDP pages */
@@ -121,20 +131,39 @@
 
 /*
  * NDMPML4E is the number of PML4 entries that are used to implement the
- * direct map.  It must be a power of two.
+ * direct map.  It must be a power of two, and should generally exceed
+ * NKPML4E.  The maximum possible value is 64; using 128 will make the
+ * direct map intrude into the recursive page table map.
  */
+#ifdef AMD64_HUGE
+#define        NDMPML4E        32
+#else
 #define        NDMPML4E        2
+#endif
 
 /*
- * The *PDI values control the layout of virtual memory.  The starting address
+ * These values control the layout of virtual memory.  The starting address
  * of the direct map, which is controlled by DMPML4I, must be a multiple of
  * its size.  (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ *
+ * Note: KPML4I is the index of the (single) level 4 page that maps
+ * the KVA that holds KERNBASE, while KPML4BASE is the index of the
+ * first level 4 page that maps VM_MIN_KERNEL_ADDRESS.  If NKPML4E
+ * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
+ * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
+ * KERNBASE.  Similarly, if KPML4I < NKPML4E, extra level 4 PDEs are
+ * needed to map from somewhere-above-KERNBASE to VM_MAX_KERNEL_ADDRESS.
+ *
+ * (KPML4I combines with KPDPI to choose where KERNBASE starts.
+ * Or, in other words, KPML4I provides bits 39..46 of KERNBASE,
+ * and KPDPI provides bits 30..38.)
  */
 #define        PML4PML4I       (NPML4EPG/2)    /* Index of recursive pml4 mapping */
 
-#define        KPML4I          (NPML4EPG-1)    /* Top 512GB for KVM */
-#define        DMPML4I         rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
+#define        KPML4BASE       (NPML4EPG-NKPML4E) /* KVM at highest addresses */
+#define        DMPML4I         rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
 
+#define        KPML4I          (NPML4EPG-1)
 #define        KPDPI           (NPDPEPG-2)     /* kernbase at -2GB */
 
 /*
diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h
index 33f62bd..47a8ef8 100644
--- a/amd64/include/vmparam.h
+++ b/amd64/include/vmparam.h
@@ -145,18 +145,26 @@
  * 0x0000000000000000 - 0x00007fffffffffff   user map
  * 0x0000800000000000 - 0xffff7fffffffffff   does not exist (hole)
  * 0xffff800000000000 - 0xffff804020100fff   recursive page table (512GB slot)
+#ifdef AMD64_HUGE
+ * 0xffff804020101000 - 0xffffdfffffffffff   unused
+ * 0xffffe00000000000 - 0xffffefffffffffff   16TB direct map
+ * 0xfffff00000000000 - 0xfffff7ffffffffff   unused
+ * 0xfffff80000000000 - 0xffffffffffffffff   8TB kernel map
+#else
  * 0xffff804020101000 - 0xfffffdffffffffff   unused
  * 0xfffffe0000000000 - 0xfffffeffffffffff   1TB direct map
  * 0xffffff0000000000 - 0xffffff7fffffffff   unused
  * 0xffffff8000000000 - 0xffffffffffffffff   512GB kernel map
+#endif
  *
  * Within the kernel map:
  *
  * 0xffffffff80000000                        KERNBASE
  */
 
-#define        VM_MAX_KERNEL_ADDRESS   KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define        VM_MIN_KERNEL_ADDRESS   KVADDR(KPML4I, NPDPEPG-512, 0, 0)
+#define        VM_MIN_KERNEL_ADDRESS   KVADDR(KPML4BASE, 0, 0, 0)
+#define        VM_MAX_KERNEL_ADDRESS   KVADDR(KPML4BASE + NKPML4E - 1, \
+                                       NPDPEPG-1, NPDEPG-1, NPTEPG-1)
 
 #define        DMAP_MIN_ADDRESS        KVADDR(DMPML4I, 0, 0, 0)
 #define        DMAP_MAX_ADDRESS        KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)
diff --git a/conf/options.amd64 b/conf/options.amd64
index 90348b7..f3ce505 100644
--- a/conf/options.amd64
+++ b/conf/options.amd64
@@ -1,6 +1,7 @@
 # $FreeBSD$
 # Options specific to AMD64 platform kernels
 
+AMD64_HUGE             opt_global.h
 AUTO_EOI_1             opt_auto_eoi.h
 AUTO_EOI_2             opt_auto_eoi.h
 COUNT_XINVLTLB_HITS    opt_smp.h
_______________________________________________
freebsd-hackers@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/freebsd-hackers
To unsubscribe, send any mail to "freebsd-hackers-unsubscribe@freebsd.org"

Reply via email to