git: e18380e34141 - main - amd64: do not assume that kernel is loaded at 2M physical

Konstantin Belousov kib at FreeBSD.org
Sat Jul 31 13:54:13 UTC 2021


The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=e18380e341410ce70d97560a22827591f4b2d373

commit e18380e341410ce70d97560a22827591f4b2d373
Author:     Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-07-10 19:48:02 +0000
Commit:     Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-07-31 13:53:42 +0000

    amd64: do not assume that kernel is loaded at 2M physical
    
    Allow the staging area to be at any 2M-aligned contiguous location
    below 4G.  It should still be mapped by the loader at KERNBASE.
    
    The assumptions the kernel makes about the loader->kernel handoff with
    regard to MMU programming are explicitly listed at the beginning of
    hammer_time(), where kernphys is calculated.  kernphys is now a variable
    rather than a linker symbol designating the physical load address.
    
    Reviewed by:    markj
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D31121
---
 sys/amd64/amd64/machdep.c   | 38 ++++++++++++++++++++++--
 sys/amd64/amd64/pmap.c      | 70 +++++++++++++++++++++++++++------------------
 sys/amd64/include/md_var.h  |  7 ++---
 sys/amd64/include/vmparam.h | 16 +++++++++--
 sys/conf/ldscript.amd64     |  5 ++--
 5 files changed, 95 insertions(+), 41 deletions(-)
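
The core of the change is the new block at the top of hammer_time() in the
machdep.c hunk below: it walks the page tables created by the loader to find
the 2M PDE backing its own text and derives kernphys from it.  A minimal
userland sketch of that arithmetic, with the usual amd64 paging constants
assumed for illustration (nothing below is taken verbatim from the patch):

#include <inttypes.h>
#include <stdio.h>

/* Assumed amd64 constants, for the sake of the example only. */
#define	PDRSHIFT	21			/* log2 of a 2M superpage */
#define	NBPDR		(1UL << PDRSHIFT)	/* bytes in a 2M superpage */
#define	PDRMASK		(NBPDR - 1)
#define	KERNBASE	0xffffffff80000000UL	/* usual amd64 KERNBASE */

int
main(void)
{
	/*
	 * Hypothetical inputs: the virtual address of hammer_time() and the
	 * physical address found in the PDE that maps it, as the real code
	 * reads them from the loader-created page tables.
	 */
	uint64_t hammer_time_va = KERNBASE + NBPDR + 0x1234;
	uint64_t pde_pa = 0x7ac00000UL;		/* staging area, below 4G */

	/*
	 * Physical address corresponding to KERNBASE: the physical base of
	 * the mapping 2M page minus the kernel-relative offset of that page.
	 */
	uint64_t kernphys = (pde_pa & ~PDRMASK) -
	    ((hammer_time_va - KERNBASE) & ~PDRMASK);

	/* Fix-up for the 2M hole the linker script leaves at KERNBASE. */
	kernphys += NBPDR;

	printf("kernphys = %#" PRIx64 "\n", kernphys);	/* 0x7ac00000 here */
	return (0);
}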

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 0cb72314e68d..8599dc2fa8f6 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1590,7 +1590,10 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	int gsel_tss, x;
 	struct pcpu *pc;
 	struct xstate_hdr *xhdr;
-	u_int64_t rsp0;
+	uint64_t cr3, rsp0;
+	pml4_entry_t *pml4e;
+	pdp_entry_t *pdpe;
+	pd_entry_t *pde;
 	char *env;
 	struct user_segment_descriptor *gdt;
 	struct region_descriptor r_gdt;
@@ -1599,6 +1602,35 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 
 	TSRAW(&thread0, TS_ENTER, __func__, NULL);
 
+	/*
+	 * Calculate kernphys by inspecting page table created by loader.
+	 * The assumptions:
+	 * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+	 *   aligned at 2M, below 4G (the latter is important for AP startup)
+	 * - there is a 2M hole at KERNBASE
+	 * - kernel is mapped with 2M superpages
+	 * - all participating memory, i.e. kernel, modules, metadata,
+	 *   page table is accessible by pre-created 1:1 mapping
+	 *   (right now loader creates 1:1 mapping for lower 4G, and all
+	 *   memory is from there)
+	 * - there is a usable memory block right after the end of the
+	 *   mapped kernel and all modules/metadata, pointed to by
+	 *   physfree, for early allocations
+	 */
+	cr3 = rcr3();
+	pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+	    (vm_offset_t)hammer_time);
+	pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+	    (vm_offset_t)hammer_time);
+	pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+	    (vm_offset_t)hammer_time);
+	kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+	    (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+	/* Fix-up for 2M hole */
+	physfree += kernphys;
+	kernphys += NBPDR;
+
 	kmdp = init_ops.parse_preload_data(modulep);
 
 	efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1644,7 +1676,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	/* Init basic tunables, hz etc */
 	init_param1();
 
-	thread0.td_kstack = physfree + KERNBASE;
+	thread0.td_kstack = physfree - kernphys + KERNSTART;
 	thread0.td_kstack_pages = kstack_pages;
 	kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
 	bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1681,7 +1713,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
 	wrmsr(MSR_GSBASE, (u_int64_t)pc);
 	wrmsr(MSR_KGSBASE, 0);		/* User value while in the kernel */
 
-	dpcpu_init((void *)(physfree + KERNBASE), 0);
+	dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
 	physfree += DPCPU_SIZE;
 	amd64_bsp_pcpu_init1(pc);
 	/* Non-late cninit() and printf() can be moved up to here. */
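
With the kernel no longer necessarily loaded at physical 2M, early boot
allocations carved out at physfree can no longer be turned into virtual
addresses by simply adding KERNBASE, which is why the td_kstack and dpcpu
lines above change.  A hedged sketch of the new conversion, with the names
and constants below assumed for illustration rather than taken from the
kernel:

#include <stdint.h>

/* Assumed amd64 values, as in the earlier sketch. */
#define	NBPDR		(1UL << 21)
#define	KERNBASE	0xffffffff80000000UL
#define	KERNSTART	(KERNBASE + NBPDR)	/* first mapped kernel page */

/* Physical address of the start of the staging area, found at boot. */
static uint64_t kernphys;

/*
 * Illustrative helper (not a kernel API): convert a physical address inside
 * the loader-mapped staging region to the virtual address it is visible at
 * through the KERNBASE mapping.  The old expression, pa + KERNBASE, matches
 * this only when kernphys == NBPDR, i.e. when the kernel really is loaded
 * at physical 2M.
 */
static void *
staging_pa_to_va(uint64_t pa)
{
	return ((void *)(pa - kernphys + KERNSTART));
}

int
main(void)
{
	kernphys = 0x7ac00000UL;		/* hypothetical staging base */
	void *kstack = staging_pa_to_va(0x7b000000UL);
	(void)kstack;				/* 0xffffffff80600000 here */
	return (0);
}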
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 47315c560831..bc203990faa1 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -436,7 +436,8 @@ static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
-static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
+vm_paddr_t		kernphys;	/* phys addr of start of bootstrap data */
+vm_paddr_t		KERNend;	/* and the end */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
@@ -1554,7 +1555,7 @@ nkpt_init(vm_paddr_t addr)
 #ifdef NKPT
 	pt_pages = NKPT;
 #else
-	pt_pages = howmany(addr, NBPDR);
+	pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
 	pt_pages += NKPDPE(pt_pages);
 
 	/*
@@ -1594,7 +1595,6 @@ nkpt_init(vm_paddr_t addr)
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
-
 	/*
 	 * The kernel is loaded at a 2MB-aligned address, and memory below that
 	 * need not be executable.  The .bss section is padded to a 2MB
@@ -1602,8 +1602,8 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * either.  Preloaded kernel modules have their mapping permissions
 	 * fixed up by the linker.
 	 */
-	if (pa < trunc_2mpage(btext - KERNBASE) ||
-	    pa >= trunc_2mpage(_end - KERNBASE))
+	if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+	    pa >= trunc_2mpage(kernphys + _end - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1612,7 +1612,7 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * impact read-only data. However, in any case, any page with
 	 * read-write data needs to be read-write.
 	 */
-	if (pa >= trunc_2mpage(brwsection - KERNBASE))
+	if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
 		return (X86_PG_RW | pg_nx);
 
 	/*
@@ -1624,7 +1624,7 @@ bootaddr_rwx(vm_paddr_t pa)
 	 * Note that fixups to the .text section will still work until we
 	 * set CR0.WP.
 	 */
-	if (pa < round_2mpage(etext - KERNBASE))
+	if (pa < round_2mpage(kernphys + etext - KERNSTART))
 		return (0);
 	return (pg_nx);
 }
@@ -1636,6 +1636,7 @@ create_pagetables(vm_paddr_t *firstaddr)
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
 	uint64_t DMPDkernphys;
+	vm_paddr_t pax;
 #ifdef KASAN
 	pt_entry_t *pt_p;
 	uint64_t KASANPDphys, KASANPTphys, KASANphys;
@@ -1670,9 +1671,11 @@ create_pagetables(vm_paddr_t *firstaddr)
 
 		/*
 		 * Allocate 2M pages for the kernel. These will be used in
-		 * place of the first one or more 1G pages from ndm1g.
+		 * place of the one or more 1G pages from ndm1g that maps
+		 * kernel memory into DMAP.
 		 */
-		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+		nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+		    kernphys - rounddown2(kernphys, NBPDP), NBPDP);
 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
 	}
 	if (ndm1g < ndmpdp)
@@ -1719,14 +1722,18 @@ create_pagetables(vm_paddr_t *firstaddr)
 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
-	 * Map from physical address zero to the end of loader preallocated
-	 * memory using 2MB pages.  This replaces some of the PD entries
-	 * created above.
+	 * Map from start of the kernel in physical memory (staging
+	 * area) to the end of loader preallocated memory using 2MB
+	 * pages.  This replaces some of the PD entries created above.
+	 * For compatibility, identity map 2M at the start.
 	 */
-	for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+	pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+	    X86_PG_RW | pg_nx;
+	for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
 		/* Preset PG_M and PG_A because demotion expects it. */
-		pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-		    X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+		pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+		    X86_PG_A | bootaddr_rwx(pax);
+	}
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1792,15 +1799,18 @@ create_pagetables(vm_paddr_t *firstaddr)
 	 * use 2M pages with read-only and no-execute permissions.  (If using 1G
 	 * pages, this will partially overwrite the PDPEs above.)
 	 */
-	if (ndm1g) {
+	if (ndm1g > 0) {
 		pd_p = (pd_entry_t *)DMPDkernphys;
-		for (i = 0; i < (NPDEPG * nkdmpde); i++)
-			pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
-			    X86_PG_M | X86_PG_A | pg_nx |
-			    bootaddr_rwx(i << PDRSHIFT);
-		for (i = 0; i < nkdmpde; i++)
-			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
-			    X86_PG_V | pg_nx;
+		for (i = 0, pax = rounddown2(kernphys, NBPDP);
+		    i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+			pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+			    X86_PG_A | pg_nx | bootaddr_rwx(pax);
+		}
+		j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+		for (i = 0; i < nkdmpde; i++) {
+			pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+			    X86_PG_RW | X86_PG_V | pg_nx;
+		}
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
@@ -1876,7 +1886,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	/*
 	 * Account for the virtual addresses mapped by create_pagetables().
 	 */
-	virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+	virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+	    (vm_paddr_t)kernphys);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
@@ -2414,7 +2425,8 @@ pmap_init(void)
 		 * Collect the page table pages that were replaced by a 2MB
 		 * page in create_pagetables().  They are zero filled.
 		 */
-		if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+		if ((i == 0 ||
+		    kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, false))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
@@ -6681,7 +6693,9 @@ setpte:
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
-	    ("pmap_promote_pde: page table page's pindex is wrong"));
+	    ("pmap_promote_pde: page table page's pindex is wrong "
+	    "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+	    mpte, mpte->pindex, va, pmap_pde_pindex(va)));
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		counter_u64_add(pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP,
@@ -10751,8 +10765,8 @@ pmap_pti_init(void)
 		va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
 	}
-	pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR,
-	    (vm_offset_t)etext, true);
+	pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+	    true);
 	pti_finalized = true;
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
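
The create_pagetables() changes above also re-base and re-size the run of 2M
entries that replaces 1G pages in the direct map over the kernel, since the
staging area may now sit in any 1G slot below 4G rather than in the first
one.  A small worked example of that sizing arithmetic, using
howmany()/rounddown2() in the style of sys/param.h and a hypothetical
staging address:

#include <inttypes.h>
#include <stdio.h>

/* howmany()/rounddown2() in the style of sys/param.h. */
#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))
#define	rounddown2(x, y) ((x) & ~((y) - 1))

#define	PDPSHIFT	30			/* log2 of a 1G page */
#define	NBPDP		(1UL << PDPSHIFT)	/* bytes mapped by one PDPE */

int
main(void)
{
	/*
	 * Hypothetical values: staging base (kernphys) and the size of the
	 * kernel up to the start of its read-write section, i.e.
	 * brwsection - KERNSTART in the patch.
	 */
	uint64_t kernphys = 0x7ac00000UL;
	uint64_t brw_off = 0x1600000UL;

	/*
	 * Page directory pages needed so that every 1G direct-map slot
	 * touched by [kernphys, kernphys + brw_off) is mapped with 2M
	 * entries instead of a single 1G page.
	 */
	uint64_t nkdmpde = howmany(brw_off + kernphys -
	    rounddown2(kernphys, NBPDP), NBPDP);

	/* Index of the first affected 1G slot among the DMAP PDP entries. */
	uint64_t j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;

	printf("nkdmpde = %" PRIu64 ", first slot = %" PRIu64 "\n",
	    nkdmpde, j);			/* 1 and 1 here */
	return (0);
}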
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index b66e314d99b1..53139711bbff 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,8 @@ extern vm_paddr_t intel_graphics_stolen_size;
 
 extern int la57;
 
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys".  Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
 
 extern bool efi_boot;
 
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index 6dd76063bf3f..88fd29b80be3 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -151,8 +151,10 @@
 #endif
 
 /*
- * Kernel physical load address. Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
  */
 #ifndef KERNLOAD
 #define	KERNLOAD	0x200000
@@ -192,7 +194,17 @@
 #define	LARGEMAP_MIN_ADDRESS	KV4ADDR(LMSPML4I, 0, 0, 0)
 #define	LARGEMAP_MAX_ADDRESS	KV4ADDR(LMEPML4I + 1, 0, 0, 0)
 
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved.  For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
 #define	KERNBASE		KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define	KERNSTART		(KERNBASE + NBPDR)
 
 #define	UPT_MAX_ADDRESS		KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
 #define	UPT_MIN_ADDRESS		KV4ADDR(PML4PML4I, 0, 0, 0)
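
For concreteness, the new KERNBASE/KERNSTART pair can be expanded
numerically.  The sketch below reconstructs the sign-extended 4-level
address in the style of vmparam.h's KV4ADDR(), assuming the usual
FreeBSD/amd64 index values (KPML4I = 511, KPDPI = 510); the printed values
are what those assumptions imply, not something quoted from the patch:

#include <inttypes.h>
#include <stdio.h>

/* Assumed amd64 paging shifts. */
#define	PML4SHIFT	39
#define	PDPSHIFT	30
#define	PDRSHIFT	21
#define	PAGE_SHIFT	12
#define	NBPDR		(1UL << PDRSHIFT)

/*
 * Sign-extended canonical kernel VA built from 4-level page-table indices,
 * in the style of vmparam.h's KV4ADDR().
 */
static uint64_t
kv4addr(uint64_t l4, uint64_t l3, uint64_t l2, uint64_t l1)
{
	return ((uint64_t)-1 << 47 | l4 << PML4SHIFT | l3 << PDPSHIFT |
	    l2 << PDRSHIFT | l1 << PAGE_SHIFT);
}

int
main(void)
{
	/* Assumed index values: KPML4I = 511, KPDPI = 510. */
	uint64_t kernbase = kv4addr(511, 510, 0, 0);
	uint64_t kernstart = kernbase + NBPDR;

	/* Expected output: 0xffffffff80000000 and 0xffffffff80200000. */
	printf("KERNBASE  = %#" PRIx64 "\nKERNSTART = %#" PRIx64 "\n",
	    kernbase, kernstart);
	return (0);
}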
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
index c11ffb6ea49f..68085ff7435c 100644
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@ ENTRY(btext)
 SEARCH_DIR("/usr/lib");
 SECTIONS
 {
-  kernphys = kernload;
   /* Read-only sections, merged into text segment: */
-  . = kernbase + kernphys + SIZEOF_HEADERS;
+  . = kernbase + kernload + SIZEOF_HEADERS;
   /*
    * Use the AT keyword in order to set the right LMA that contains
    * the physical address where the section should be loaded. This is
    * needed for the Xen loader which honours the LMA.
    */
-  .interp         : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+  .interp         : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
   .hash           : { *(.hash) }
   .gnu.hash       : { *(.gnu.hash) }
   .dynsym         : { *(.dynsym) }

