git: 8ca493ffb446 - stable/13 - amd64: do not assume that kernel is loaded at 2M physical

Konstantin Belousov kib at FreeBSD.org
Mon Aug 23 23:22:22 UTC 2021


The branch stable/13 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=8ca493ffb44691e70ae92300b8de1c1d30134ef4

commit 8ca493ffb44691e70ae92300b8de1c1d30134ef4
Author:     Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-07-10 19:48:02 +0000
Commit:     Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-08-23 23:21:13 +0000

    amd64: do not assume that kernel is loaded at 2M physical
    
    (cherry picked from commit e18380e341410ce70d97560a22827591f4b2d373)
---
 sys/amd64/amd64/machdep.c   | 38 ++++++++++++++++++++++--
 sys/amd64/amd64/pmap.c      | 72 +++++++++++++++++++++++++++------------------
 sys/amd64/include/md_var.h  |  7 ++---
 sys/amd64/include/vmparam.h | 16 ++++++++--
 sys/conf/ldscript.amd64     |  5 ++--
 5 files changed, 96 insertions(+), 42 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 63f933ad535c..2c8711fd3d2a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1599,7 +1599,10 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
-	u_int64_t rsp0;
+	uint64_t cr3, rsp0;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
@@ -1608,6 +1611,35 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
+	/*
+	 * Calculate kernphys by inspecting page table created by loader.
+	 * The assumptions:
+	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+	 *   aligned at 2M, below 4G (the latter is important for AP startup)
+	 * - there is a 2M hole at KERNBASE
+	 * - kernel is mapped with 2M superpages
+	 * - all participating memory, i.e. kernel, modules, metadata,
+	 *   page table is accessible by pre-created 1:1 mapping
+	 *   (right now loader creates 1:1 mapping for lower 4G, and all
+	 *   memory is from there)
+	 * - there is a usable memory block right after the end of the
+	 *   mapped kernel and all modules/metadata, pointed to by
+	 *   physfree, for early allocations
+	 */
+	cr3 = rcr3();
+	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+	    (vm_offset_t)hammer_time);
+	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+	    (vm_offset_t)hammer_time);
+	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+	    (vm_offset_t)hammer_time);
+	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+	/* Fix-up for 2M hole */
+	physfree += kernphys;
+	kernphys += NBPDR;
+
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1653,7 +1685,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	/* Init basic tunables, hz etc */
 	init_param1();
 
-	thread0.td_kstack = physfree + KERNBASE;
+	thread0.td_kstack = physfree - kernphys + KERNSTART;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1690,7 +1722,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
-	dpcpu_init((void *)(physfree + KERNBASE), 0);
+	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
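
A rough illustration of the page-table walk above, with addresses
invented for the example: suppose the loader staged the kernel at
physical 0x40000000 and hammer_time sits at KERNBASE + 0x201000, just
past the 2M hole.  The 2M PDE covering that address then points at
0x40000000, so

    kernphys = 0x40000000 - (0x201000 & ~PDRMASK)
             = 0x40000000 - 0x200000
             = 0x3fe00000

which is the physical address that notionally corresponds to KERNBASE
itself.  The 2M-hole fix-up, kernphys += NBPDR, then yields 0x40000000,
the physical page that actually backs KERNSTART.
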
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index e5d46449c275..d35422924b1f 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -429,7 +429,8 @@ static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
-static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
+vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
+vm_paddr_t		KERNend;	/* and the end */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
@@ -1532,7 +1533,7 @@ nkpt_init(vm_paddr_t addr)
 #ifdef NKPT
 	pt_pages = NKPT;
 #else
-	pt_pages = howmany(addr, NBPDR);
+	pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
 	pt_pages += NKPDPE(pt_pages);
 
 	/*
@@ -1572,7 +1573,6 @@ nkpt_init(vm_paddr_t addr)
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
-
 	/*
 	 * The kernel is loaded at a 2MB-aligned address, and memory below that
 	 * need not be executable.  The .bss section is padded to a 2MB
@@ -1580,8 +1580,8 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * either.  Preloaded kernel modules have their mapping permissions
 	 * fixed up by the linker.
 	 */
-	if (pa < trunc_2mpage(btext - KERNBASE) ||
-	    pa >= trunc_2mpage(_end - KERNBASE))
+	if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+	    pa >= trunc_2mpage(kernphys + _end - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1590,7 +1590,7 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * impact read-only data. However, in any case, any page with
 	 * read-write data needs to be read-write.
 	 */
-	if (pa >= trunc_2mpage(brwsection - KERNBASE))
+	if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1602,7 +1602,7 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * Note that fixups to the .text section will still work until we
 	 * set CR0.WP.
 	 */
-	if (pa < round_2mpage(etext - KERNBASE))
+	if (pa < round_2mpage(kernphys + etext - KERNSTART))
 		return (0);
 	return (pg_nx);
 }
@@ -1610,11 +1610,12 @@ bootaddr_rwx(vm_paddr_t pa)
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
-	int i, j, ndm1g, nkpdpe, nkdmpde;
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
 	uint64_t DMPDkernphys;
+	vm_paddr_t pax;
+	int i, j, ndm1g, nkpdpe, nkdmpde;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
@@ -1642,9 +1643,11 @@ create_pagetables(vm_paddr_t *firstaddr)
 
 		/*
 		 * Allocate 2M pages for the kernel. These will be used in
-		 * place of the first one or more 1G pages from ndm1g.
+		 * place of the one or more 1G pages from ndm1g that maps
+		 * kernel memory into DMAP.
 		 */
-		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
 	}
 	if (ndm1g < ndmpdp)
@@ -1681,14 +1684,18 @@ create_pagetables(vm_paddr_t *firstaddr)
 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
-	 * Map from physical address zero to the end of loader preallocated
-	 * memory using 2MB pages.  This replaces some of the PD entries
-	 * created above.
+	 * Map from start of the kernel in physical memory (staging
+	 * area) to the end of loader preallocated memory using 2MB
+	 * pages.  This replaces some of the PD entries created above.
+	 * For compatibility, identity map 2M at the start.
 	 */
-	for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+	    X86_PG_RW | pg_nx;
+	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-		    X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+		    X86_PG_A | bootaddr_rwx(pax);
+	}
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1735,15 +1742,18 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
 	 * pages, this will partially overwrite the PDPEs above.)
 	 */
-	if (ndm1g) {
+	if (ndm1g > 0) {
 		pd_p = (pd_entry_t *)DMPDkernphys;
-		for (i = 0; i < (NPDEPG * nkdmpde); i++)
-			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-			    X86_PG_M | X86_PG_A | pg_nx |
-			    bootaddr_rwx(i << PDRSHIFT);
-		for (i = 0; i < nkdmpde; i++)
-			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
-			    X86_PG_V | pg_nx;
+		for (i = 0, pax = rounddown2(kernphys, NBPDP);
+		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
+		}
+		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+		for (i = 0; i < nkdmpde; i++) {
+			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+			    X86_PG_RW | X86_PG_V | pg_nx;
+		}
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
@@ -1811,7 +1821,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	/*
 	 * Account for the virtual addresses mapped by create_pagetables().
 	 */
-	virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+	    (vm_paddr_t)kernphys);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
@@ -2348,7 +2359,8 @@ pmap_init(void)
 		 * Collect the page table pages that were replaced by a 2MB
 		 * page in create_pagetables().  They are zero filled.
 		 */
-		if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+		if ((i == 0 ||
+		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, false))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
@@ -6567,7 +6579,9 @@ setpte:
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
-	    ("pmap_promote_pde: page table page's pindex is wrong"));
+	    ("pmap_promote_pde: page table page's pindex is wrong "
+	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		atomic_add_long(&pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP,
@@ -10625,8 +10639,8 @@ pmap_pti_init(void)
 		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
 	}
-	pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR,
-	    (vm_offset_t)etext, true);
+	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+	    true);
 	pti_finalized = true;
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
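
With these changes the bootstrap mapping built by create_pagetables()
looks roughly like this (a sketch of the layout the hunks above
establish):

    KVA                                 physical
    KERNBASE      .. KERNBASE + 2M  ->  0        .. 2M          (compat entry, RW + NX)
    KERNSTART + off                 ->  kernphys + off          (0 <= off < KERNend - kernphys)

so a loader-provided physical address pa is reached at
KERNSTART + (pa - kernphys) rather than KERNBASE + pa, which is exactly
the adjustment the machdep.c hunks apply to thread0's kstack and the
dpcpu area.
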
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index b66e314d99b1..53139711bbff 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,8 @@ extern vm_paddr_t intel_graphics_stolen_size;
 
 extern int la57;
 
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
 
 extern bool efi_boot;
 
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index c7ffb218dd4a..b6f79ef8ca84 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -149,8 +149,10 @@
 #endif
 
 /*
- * Kernel physical load address. Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
  */
 #ifndef KERNLOAD
 #define	KERNLOAD	0x200000
@@ -186,7 +188,17 @@
 #define	LARGEMAP_MIN_ADDRESS	KV4ADDR(LMSPML4I, 0, 0, 0)
 #define	LARGEMAP_MAX_ADDRESS	KV4ADDR(LMEPML4I + 1, 0, 0, 0)
 
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved.  For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
 #define	KERNBASE		KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define	KERNSTART		(KERNBASE + NBPDR)
 
 #define	UPT_MAX_ADDRESS		KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
 #define	UPT_MIN_ADDRESS		KV4ADDR(PML4PML4I, 0, 0, 0)
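
For concreteness, with the usual non-LA57 constants this works out to

    KERNSTART = KERNBASE + NBPDR
              = 0xffffffff80000000 + 0x200000
              = 0xffffffff80200000

so the kernel text keeps the virtual address it has always had; only
the physical address behind it may now vary.
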
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
index c11ffb6ea49f..68085ff7435c 100644
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@ ENTRY(btext)
 SEARCH_DIR("/usr/lib");
 SECTIONS
 {
-  kernphys = kernload;
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernphys + SIZEOF_HEADERS;
+  . = kernbase + kernload + SIZEOF_HEADERS;
   /*
    * Use the AT keyword in order to set the right LMA that contains
    * the physical address where the section should be loaded. This is
    * needed for the Xen loader which honours the LMA.
    */
-  .interp         : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+  .interp         : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
   .hash           : { *(.hash) }
   .gnu.hash       : { *(.gnu.hash) }
   .dynsym         : { *(.dynsym) }
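
The link addresses themselves do not move: with kernbase at
0xffffffff80000000 and kernload at 0x200000 (their usual values), the
first section still lands just above 0xffffffff80200000 (KERNSTART),
and the LMA of the first section remains kernload-based for the benefit
of the Xen loader.  What changes is that kernphys is no longer a
linker-provided symbol; it is now the vm_paddr_t variable in pmap.c
that hammer_time() computes at run time, per the md_var.h hunk above.
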

