expanding amd64 past the 1TB limit

Chris Torek torek at torek.net
Mon Jul 8 06:42:45 UTC 2013


Here is a final (I hope) version of the patch.  I dropped the
config option, but I added code to limit the "real" size of the
direct map PDEs.  The end result is that on small systems, this
ties up 14 more pages (15 from increasing NKPML4E, but one
regained because the new static variable ndmpdpphys is 1 instead
of 2).

(I fixed the comment errors I spotted earlier, too.)

Chris

 amd64/amd64/pmap.c      | 100 +++++++++++++++++++++++++++++-------------------
 amd64/include/pmap.h    |  36 +++++++++++++----
 amd64/include/vmparam.h |  13 ++++---
 3 files changed, 97 insertions(+), 52 deletions(-)

Author: Chris Torek <chris.torek at gmail.com>
Date:   Thu Jun 27 18:49:29 2013 -0600

    increase physical and virtual memory limits
    
    Increase kernel VM space: go from .5 TB of KVA and 1 TB of direct
    map, to 8 TB of KVA and 16 TB of direct map.  However, we allocate
    less direct map space for small physical-memory systems.  Also, if
    Maxmem is so large that there is not enough direct map space,
    reduce Maxmem to fit, so that the system can boot unassisted.

diff --git a/amd64/amd64/pmap.c b/amd64/amd64/pmap.c
index 8dcf232..7368c96 100644
--- a/amd64/amd64/pmap.c
+++ b/amd64/amd64/pmap.c
@@ -232,6 +232,7 @@ u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
 
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
+static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
 static struct rwlock_padalign pvh_global_lock;
 
@@ -531,12 +532,27 @@ static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
 	int i, j, ndm1g, nkpdpe;
+	pt_entry_t *pt_p;
+	pd_entry_t *pd_p;
+	pdp_entry_t *pdp_p;
+	pml4_entry_t *p4_p;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = (ptoa(Maxmem) + NBPDP - 1) >> PDPSHIFT;
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
-	DMPDPphys = allocpages(firstaddr, NDMPML4E);
+	ndmpdpphys = howmany(ndmpdp, NPML4EPG);
+	if (ndmpdpphys > NDMPML4E) {
+		/*
+		 * Each NDMPML4E allows 512 GB, so limit to that,
+		 * and then readjust ndmpdp and ndmpdpphys.
+		 */
+		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
+		Maxmem = atop(NDMPML4E * NBPML4);
+		ndmpdpphys = NDMPML4E;
+		ndmpdp = NDMPML4E * NPDEPG;
+	}
+	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
 	if ((amd_feature & AMDID_PAGE1GB) != 0)
 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
@@ -553,6 +569,10 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * bootstrap.  We defer this until after all memory-size dependent
 	 * allocations are done (e.g. direct map), so that we don't have to
 	 * build in too much slop in our estimate.
+	 *
+	 * Note that when NKPML4E > 1, we have an empty page underneath
+	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
+	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 	 */
 	nkpt_init(*firstaddr);
 	nkpdpe = NKPDPE(nkpt);
@@ -561,32 +581,26 @@ create_pagetables(vm_paddr_t *firstaddr)
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 	/* Fill in the underlying page table pages */
-	/* Read-only from zero to physfree */
+	/* Nominally read-only (but really R/W) from zero to physfree */
 	/* XXX not fully used, underneath 2M pages */
-	for (i = 0; (i << PAGE_SHIFT) < *firstaddr; i++) {
-		((pt_entry_t *)KPTphys)[i] = i << PAGE_SHIFT;
-		((pt_entry_t *)KPTphys)[i] |= PG_RW | PG_V | PG_G;
-	}
+	pt_p = (pt_entry_t *)KPTphys;
+	for (i = 0; ptoa(i) < *firstaddr; i++)
+		pt_p[i] = ptoa(i) | PG_RW | PG_V | PG_G;
 
 	/* Now map the page tables at their location within PTmap */
-	for (i = 0; i < nkpt; i++) {
-		((pd_entry_t *)KPDphys)[i] = KPTphys + (i << PAGE_SHIFT);
-		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V;
-	}
+	pd_p = (pd_entry_t *)KPDphys;
+	for (i = 0; i < nkpt; i++)
+		pd_p[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
 
 	/* Map from zero to end of allocations under 2M pages */
 	/* This replaces some of the KPTphys entries above */
-	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++) {
-		((pd_entry_t *)KPDphys)[i] = i << PDRSHIFT;
-		((pd_entry_t *)KPDphys)[i] |= PG_RW | PG_V | PG_PS | PG_G;
-	}
+	for (i = 0; (i << PDRSHIFT) < *firstaddr; i++)
+		pd_p[i] = (i << PDRSHIFT) | PG_RW | PG_V | PG_PS | PG_G;
 
-	/* And connect up the PD to the PDP */
-	for (i = 0; i < nkpdpe; i++) {
-		((pdp_entry_t *)KPDPphys)[i + KPDPI] = KPDphys +
-		    (i << PAGE_SHIFT);
-		((pdp_entry_t *)KPDPphys)[i + KPDPI] |= PG_RW | PG_V | PG_U;
-	}
+	/* And connect up the PD to the PDP (leaving room for L4 pages) */
+	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
+	for (i = 0; i < nkpdpe; i++)
+		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | PG_RW | PG_V | PG_U;
 
 	/*
 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
@@ -596,37 +610,41 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
 	 * that are partially used. 
 	 */
+	pd_p = (pd_entry_t *)DMPDphys;
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
-		((pd_entry_t *)DMPDphys)[j] = (vm_paddr_t)i << PDRSHIFT;
+		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		((pd_entry_t *)DMPDphys)[j] |= PG_RW | PG_V | PG_PS | PG_G |
+		pd_p[j] |= PG_RW | PG_V | PG_PS | PG_G |
 		    PG_M | PG_A;
 	}
+	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
-		((pdp_entry_t *)DMPDPphys)[i] = (vm_paddr_t)i << PDPSHIFT;
+		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
-		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_PS | PG_G |
+		pdp_p[i] |= PG_RW | PG_V | PG_PS | PG_G |
 		    PG_M | PG_A;
 	}
 	for (j = 0; i < ndmpdp; i++, j++) {
-		((pdp_entry_t *)DMPDPphys)[i] = DMPDphys + (j << PAGE_SHIFT);
-		((pdp_entry_t *)DMPDPphys)[i] |= PG_RW | PG_V | PG_U;
+		pdp_p[i] = DMPDphys + ptoa(j);
+		pdp_p[i] |= PG_RW | PG_V | PG_U;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
-	((pdp_entry_t *)KPML4phys)[PML4PML4I] = KPML4phys;
-	((pdp_entry_t *)KPML4phys)[PML4PML4I] |= PG_RW | PG_V | PG_U;
+	p4_p = (pml4_entry_t *)KPML4phys;
+	p4_p[PML4PML4I] = KPML4phys;
+	p4_p[PML4PML4I] |= PG_RW | PG_V | PG_U;
 
 	/* Connect the Direct Map slot(s) up to the PML4. */
-	for (i = 0; i < NDMPML4E; i++) {
-		((pdp_entry_t *)KPML4phys)[DMPML4I + i] = DMPDPphys +
-		    (i << PAGE_SHIFT);
-		((pdp_entry_t *)KPML4phys)[DMPML4I + i] |= PG_RW | PG_V | PG_U;
+	for (i = 0; i < ndmpdpphys; i++) {
+		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
+		p4_p[DMPML4I + i] |= PG_RW | PG_V | PG_U;
 	}
 
-	/* Connect the KVA slot up to the PML4 */
-	((pdp_entry_t *)KPML4phys)[KPML4I] = KPDPphys;
-	((pdp_entry_t *)KPML4phys)[KPML4I] |= PG_RW | PG_V | PG_U;
+	/* Connect the KVA slots up to the PML4 */
+	for (i = 0; i < NKPML4E; i++) {
+		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
+		p4_p[KPML4BASE + i] |= PG_RW | PG_V | PG_U;
+	}
 }
 
 /*
@@ -1685,8 +1703,11 @@ pmap_pinit(pmap_t pmap)
 		pagezero(pmap->pm_pml4);
 
 	/* Wire in kernel global address entries. */
-	pmap->pm_pml4[KPML4I] = KPDPphys | PG_RW | PG_V | PG_U;
-	for (i = 0; i < NDMPML4E; i++) {
+	for (i = 0; i < NKPML4E; i++) {
+		pmap->pm_pml4[KPML4BASE + i] = (KPDPphys + (i << PAGE_SHIFT)) |
+		    PG_RW | PG_V | PG_U;
+	}
+	for (i = 0; i < ndmpdpphys; i++) {
 		pmap->pm_pml4[DMPML4I + i] = (DMPDPphys + (i << PAGE_SHIFT)) |
 		    PG_RW | PG_V | PG_U;
 	}
@@ -1941,8 +1962,9 @@ pmap_release(pmap_t pmap)
 
 	m = PHYS_TO_VM_PAGE(pmap->pm_pml4[PML4PML4I] & PG_FRAME);
 
-	pmap->pm_pml4[KPML4I] = 0;	/* KVA */
-	for (i = 0; i < NDMPML4E; i++)	/* Direct Map */
+	for (i = 0; i < NKPML4E; i++)	/* KVA */
+		pmap->pm_pml4[KPML4BASE + i] = 0;
+	for (i = 0; i < ndmpdpphys; i++)/* Direct Map */
 		pmap->pm_pml4[DMPML4I + i] = 0;
 	pmap->pm_pml4[PML4PML4I] = 0;	/* Recursive Mapping */
 
diff --git a/amd64/include/pmap.h b/amd64/include/pmap.h
index dc02e49..da80241 100644
--- a/amd64/include/pmap.h
+++ b/amd64/include/pmap.h
@@ -113,28 +113,50 @@
 	((unsigned long)(l2) << PDRSHIFT) | \
 	((unsigned long)(l1) << PAGE_SHIFT))
 
-#define NKPML4E		1		/* number of kernel PML4 slots */
+/*
+ * Number of kernel PML4 slots.  Can be anywhere from 1 to 64 or so,
+ * but setting it larger than NDMPML4E makes no sense.
+ *
+ * Each slot provides .5 TB of kernel virtual space.
+ */
+#define NKPML4E		16
 
 #define	NUPML4E		(NPML4EPG/2)	/* number of userland PML4 pages */
 #define	NUPDPE		(NUPML4E*NPDPEPG)/* number of userland PDP pages */
 #define	NUPDE		(NUPDPE*NPDEPG)	/* number of userland PD entries */
 
 /*
- * NDMPML4E is the number of PML4 entries that are used to implement the
- * direct map.  It must be a power of two.
+ * NDMPML4E is the maximum number of PML4 entries that will be
+ * used to implement the direct map.  It must be a power of two,
+ * and should generally exceed NKPML4E.  The maximum possible
+ * value is 64; using 128 will make the direct map intrude into
+ * the recursive page table map.
  */
-#define	NDMPML4E	2
+#define	NDMPML4E	32
 
 /*
- * The *PDI values control the layout of virtual memory.  The starting address
+ * These values control the layout of virtual memory.  The starting address
  * of the direct map, which is controlled by DMPML4I, must be a multiple of
  * its size.  (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.)
+ *
+ * Note: KPML4I is the index of the (single) level 4 page that maps
+ * the KVA that holds KERNBASE, while KPML4BASE is the index of the
+ * first level 4 page that maps VM_MIN_KERNEL_ADDRESS.  If NKPML4E
+ * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra
+ * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to
+ * KERNBASE.  Similarly, if KMPL4I < (base+N), extra level 4 PDEs are
+ * needed to map from somewhere-above-KERNBASE to VM_MAX_KERNEL_ADDRESS.
+ *
+ * (KPML4I combines with KPDPI to choose where KERNBASE starts.
+ * Or, in other words, KPML4I provides bits 39..46 of KERNBASE,
+ * and KPDPI provides bits 30..38.)
  */
 #define	PML4PML4I	(NPML4EPG/2)	/* Index of recursive pml4 mapping */
 
-#define	KPML4I		(NPML4EPG-1)	/* Top 512GB for KVM */
-#define	DMPML4I		rounddown(KPML4I - NDMPML4E, NDMPML4E) /* Below KVM */
+#define	KPML4BASE	(NPML4EPG-NKPML4E) /* KVM at highest addresses */
+#define	DMPML4I		rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */
 
+#define	KPML4I		(NPML4EPG-1)
 #define	KPDPI		(NPDPEPG-2)	/* kernbase at -2GB */
 
 /*
diff --git a/amd64/include/vmparam.h b/amd64/include/vmparam.h
index 33f62bd..cff2558 100644
--- a/amd64/include/vmparam.h
+++ b/amd64/include/vmparam.h
@@ -145,18 +145,19 @@
  * 0x0000000000000000 - 0x00007fffffffffff   user map
  * 0x0000800000000000 - 0xffff7fffffffffff   does not exist (hole)
  * 0xffff800000000000 - 0xffff804020100fff   recursive page table (512GB slot)
- * 0xffff804020101000 - 0xfffffdffffffffff   unused
- * 0xfffffe0000000000 - 0xfffffeffffffffff   1TB direct map
- * 0xffffff0000000000 - 0xffffff7fffffffff   unused
- * 0xffffff8000000000 - 0xffffffffffffffff   512GB kernel map
+ * 0xffff804020101000 - 0xffffdfffffffffff   unused
+ * 0xffffe00000000000 - 0xffffefffffffffff   16TB direct map
+ * 0xfffff00000000000 - 0xfffff7ffffffffff   unused
+ * 0xfffff80000000000 - 0xffffffffffffffff   8TB kernel map
  *
  * Within the kernel map:
  *
  * 0xffffffff80000000                        KERNBASE
  */
 
-#define	VM_MAX_KERNEL_ADDRESS	KVADDR(KPML4I, NPDPEPG-1, NPDEPG-1, NPTEPG-1)
-#define	VM_MIN_KERNEL_ADDRESS	KVADDR(KPML4I, NPDPEPG-512, 0, 0)
+#define	VM_MIN_KERNEL_ADDRESS	KVADDR(KPML4BASE, 0, 0, 0)
+#define	VM_MAX_KERNEL_ADDRESS	KVADDR(KPML4BASE + NKPML4E - 1, \
+					NPDPEPG-1, NPDEPG-1, NPTEPG-1)
 
 #define	DMAP_MIN_ADDRESS	KVADDR(DMPML4I, 0, 0, 0)
 #define	DMAP_MAX_ADDRESS	KVADDR(DMPML4I + NDMPML4E, 0, 0, 0)


More information about the freebsd-hackers mailing list