git: 8ca493ffb446 - stable/13 - amd64: do not assume that kernel is loaded at 2M physical
Konstantin Belousov
kib at FreeBSD.org
Mon Aug 23 23:22:22 UTC 2021
The branch stable/13 has been updated by kib:
URL: https://cgit.FreeBSD.org/src/commit/?id=8ca493ffb44691e70ae92300b8de1c1d30134ef4
commit 8ca493ffb44691e70ae92300b8de1c1d30134ef4
Author: Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-07-10 19:48:02 +0000
Commit: Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-08-23 23:21:13 +0000
amd64: do not assume that kernel is loaded at 2M physical
(cherry picked from commit e18380e341410ce70d97560a22827591f4b2d373)
---
sys/amd64/amd64/machdep.c | 38 ++++++++++++++++++++++--
sys/amd64/amd64/pmap.c | 72 +++++++++++++++++++++++++++------------------
sys/amd64/include/md_var.h | 7 ++---
sys/amd64/include/vmparam.h | 16 ++++++++--
sys/conf/ldscript.amd64 | 5 ++--
5 files changed, 96 insertions(+), 42 deletions(-)
diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 63f933ad535c..2c8711fd3d2a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -1599,7 +1599,10 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
int gsel_tss, x;
struct pcpu *pc;
struct xstate_hdr *xhdr;
- u_int64_t rsp0;
+ uint64_t cr3, rsp0;
+ pml4_entry_t *pml4e;
+ pdp_entry_t *pdpe;
+ pd_entry_t *pde;
char *env;
struct user_segment_descriptor *gdt;
struct region_descriptor r_gdt;
@@ -1608,6 +1611,35 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
TSRAW(&thread0, TS_ENTER, __func__, NULL);
+ /*
+ * Calculate kernphys by inspecting page table created by loader.
+ * The assumptions:
+ * - kernel is mapped at KERNBASE, backed by contiguous phys memory
+ * aligned at 2M, below 4G (the latter is important for AP startup)
+ * - there is a 2M hole at KERNBASE
+ * - kernel is mapped with 2M superpages
+ * - all participating memory, i.e. kernel, modules, metadata,
+ * page table, is accessible by pre-created 1:1 mapping
+ * (right now loader creates 1:1 mapping for lower 4G, and all
+ * memory is from there)
+ * - there is a usable memory block right after the end of the
+ * mapped kernel and all modules/metadata, pointed to by
+ * physfree, for early allocations
+ */
+ cr3 = rcr3();
+ pml4e = (pml4_entry_t *)(cr3 & ~PAGE_MASK) + pmap_pml4e_index(
+ (vm_offset_t)hammer_time);
+ pdpe = (pdp_entry_t *)(*pml4e & ~PAGE_MASK) + pmap_pdpe_index(
+ (vm_offset_t)hammer_time);
+ pde = (pd_entry_t *)(*pdpe & ~PAGE_MASK) + pmap_pde_index(
+ (vm_offset_t)hammer_time);
+ kernphys = (vm_paddr_t)(*pde & ~PDRMASK) -
+ (vm_paddr_t)(((vm_offset_t)hammer_time - KERNBASE) & ~PDRMASK);
+
+ /* Fix-up for 2M hole */
+ physfree += kernphys;
+ kernphys += NBPDR;
+
kmdp = init_ops.parse_preload_data(modulep);
efi_boot = preload_search_info(kmdp, MODINFO_METADATA |
@@ -1653,7 +1685,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
/* Init basic tunables, hz etc */
init_param1();
- thread0.td_kstack = physfree + KERNBASE;
+ thread0.td_kstack = physfree - kernphys + KERNSTART;
thread0.td_kstack_pages = kstack_pages;
kstack0_sz = thread0.td_kstack_pages * PAGE_SIZE;
bzero((void *)thread0.td_kstack, kstack0_sz);
@@ -1690,7 +1722,7 @@ hammer_time(u_int64_t modulep, u_int64_t physfree)
wrmsr(MSR_GSBASE, (u_int64_t)pc);
wrmsr(MSR_KGSBASE, 0); /* User value while in the kernel */
- dpcpu_init((void *)(physfree + KERNBASE), 0);
+ dpcpu_init((void *)(physfree - kernphys + KERNSTART), 0);
physfree += DPCPU_SIZE;
amd64_bsp_pcpu_init1(pc);
/* Non-late cninit() and printf() can be moved up to here. */
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index e5d46449c275..d35422924b1f 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -429,7 +429,8 @@ static u_int64_t DMPDphys; /* phys addr of direct mapped level 2 */
static u_int64_t DMPDPphys; /* phys addr of direct mapped level 3 */
static int ndmpdpphys; /* number of DMPDPphys pages */
-static vm_paddr_t KERNend; /* phys addr of end of bootstrap data */
+vm_paddr_t kernphys; /* phys addr of start of bootstrap data */
+vm_paddr_t KERNend; /* and the end */
/*
* pmap_mapdev support pre initialization (i.e. console)
@@ -1532,7 +1533,7 @@ nkpt_init(vm_paddr_t addr)
#ifdef NKPT
pt_pages = NKPT;
#else
- pt_pages = howmany(addr, NBPDR);
+ pt_pages = howmany(addr - kernphys, NBPDR) + 1; /* +1 for 2M hole @0 */
pt_pages += NKPDPE(pt_pages);
/*
@@ -1572,7 +1573,6 @@ nkpt_init(vm_paddr_t addr)
static inline pt_entry_t
bootaddr_rwx(vm_paddr_t pa)
{
-
/*
* The kernel is loaded at a 2MB-aligned address, and memory below that
* need not be executable. The .bss section is padded to a 2MB
@@ -1580,8 +1580,8 @@ bootaddr_rwx(vm_paddr_t pa)
* either. Preloaded kernel modules have their mapping permissions
* fixed up by the linker.
*/
- if (pa < trunc_2mpage(btext - KERNBASE) ||
- pa >= trunc_2mpage(_end - KERNBASE))
+ if (pa < trunc_2mpage(kernphys + btext - KERNSTART) ||
+ pa >= trunc_2mpage(kernphys + _end - KERNSTART))
return (X86_PG_RW | pg_nx);
/*
@@ -1590,7 +1590,7 @@ bootaddr_rwx(vm_paddr_t pa)
* impact read-only data. However, in any case, any page with
* read-write data needs to be read-write.
*/
- if (pa >= trunc_2mpage(brwsection - KERNBASE))
+ if (pa >= trunc_2mpage(kernphys + brwsection - KERNSTART))
return (X86_PG_RW | pg_nx);
/*
@@ -1602,7 +1602,7 @@ bootaddr_rwx(vm_paddr_t pa)
* Note that fixups to the .text section will still work until we
* set CR0.WP.
*/
- if (pa < round_2mpage(etext - KERNBASE))
+ if (pa < round_2mpage(kernphys + etext - KERNSTART))
return (0);
return (pg_nx);
}
@@ -1610,11 +1610,12 @@ bootaddr_rwx(vm_paddr_t pa)
static void
create_pagetables(vm_paddr_t *firstaddr)
{
- int i, j, ndm1g, nkpdpe, nkdmpde;
pd_entry_t *pd_p;
pdp_entry_t *pdp_p;
pml4_entry_t *p4_p;
uint64_t DMPDkernphys;
+ vm_paddr_t pax;
+ int i, j, ndm1g, nkpdpe, nkdmpde;
/* Allocate page table pages for the direct map */
ndmpdp = howmany(ptoa(Maxmem), NBPDP);
@@ -1642,9 +1643,11 @@ create_pagetables(vm_paddr_t *firstaddr)
/*
* Allocate 2M pages for the kernel. These will be used in
- * place of the first one or more 1G pages from ndm1g.
+ * place of the one or more 1G pages from ndm1g that map
+ * kernel memory into DMAP.
*/
- nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
+ nkdmpde = howmany((vm_offset_t)brwsection - KERNSTART +
+ kernphys - rounddown2(kernphys, NBPDP), NBPDP);
DMPDkernphys = allocpages(firstaddr, nkdmpde);
}
if (ndm1g < ndmpdp)
@@ -1681,14 +1684,18 @@ create_pagetables(vm_paddr_t *firstaddr)
pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
/*
- * Map from physical address zero to the end of loader preallocated
- * memory using 2MB pages. This replaces some of the PD entries
- * created above.
+ * Map from start of the kernel in physical memory (staging
+ * area) to the end of loader preallocated memory using 2MB
+ * pages. This replaces some of the PD entries created above.
+ * For compatibility, identity map 2M at the start.
*/
- for (i = 0; (i << PDRSHIFT) < KERNend; i++)
+ pd_p[0] = X86_PG_V | PG_PS | pg_g | X86_PG_M | X86_PG_A |
+ X86_PG_RW | pg_nx;
+ for (i = 1, pax = kernphys; pax < KERNend; i++, pax += NBPDR) {
/* Preset PG_M and PG_A because demotion expects it. */
- pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
- X86_PG_M | X86_PG_A | bootaddr_rwx(i << PDRSHIFT);
+ pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+ X86_PG_A | bootaddr_rwx(pax);
+ }
/*
* Because we map the physical blocks in 2M pages, adjust firstaddr
@@ -1735,15 +1742,18 @@ create_pagetables(vm_paddr_t *firstaddr)
* use 2M pages with read-only and no-execute permissions. (If using 1G
* pages, this will partially overwrite the PDPEs above.)
*/
- if (ndm1g) {
+ if (ndm1g > 0) {
pd_p = (pd_entry_t *)DMPDkernphys;
- for (i = 0; i < (NPDEPG * nkdmpde); i++)
- pd_p[i] = (i << PDRSHIFT) | X86_PG_V | PG_PS | pg_g |
- X86_PG_M | X86_PG_A | pg_nx |
- bootaddr_rwx(i << PDRSHIFT);
- for (i = 0; i < nkdmpde; i++)
- pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
- X86_PG_V | pg_nx;
+ for (i = 0, pax = rounddown2(kernphys, NBPDP);
+ i < NPDEPG * nkdmpde; i++, pax += NBPDR) {
+ pd_p[i] = pax | X86_PG_V | PG_PS | pg_g | X86_PG_M |
+ X86_PG_A | pg_nx | bootaddr_rwx(pax);
+ }
+ j = rounddown2(kernphys, NBPDP) >> PDPSHIFT;
+ for (i = 0; i < nkdmpde; i++) {
+ pdp_p[i + j] = (DMPDkernphys + ptoa(i)) |
+ X86_PG_RW | X86_PG_V | pg_nx;
+ }
}
/* And recursively map PML4 to itself in order to get PTmap */
@@ -1811,7 +1821,8 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
/*
* Account for the virtual addresses mapped by create_pagetables().
*/
- virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
+ virtual_avail = (vm_offset_t)KERNSTART + round_2mpage(KERNend -
+ (vm_paddr_t)kernphys);
virtual_end = VM_MAX_KERNEL_ADDRESS;
/*
@@ -2348,7 +2359,8 @@ pmap_init(void)
* Collect the page table pages that were replaced by a 2MB
* page in create_pagetables(). They are zero filled.
*/
- if ((vm_paddr_t)i << PDRSHIFT < KERNend &&
+ if ((i == 0 ||
+ kernphys + ((vm_paddr_t)(i - 1) << PDRSHIFT) < KERNend) &&
pmap_insert_pt_page(kernel_pmap, mpte, false))
panic("pmap_init: pmap_insert_pt_page failed");
}
@@ -6567,7 +6579,9 @@ setpte:
mpte < &vm_page_array[vm_page_array_size],
("pmap_promote_pde: page table page is out of range"));
KASSERT(mpte->pindex == pmap_pde_pindex(va),
- ("pmap_promote_pde: page table page's pindex is wrong"));
+ ("pmap_promote_pde: page table page's pindex is wrong "
+ "mpte %p pidx %#lx va %#lx va pde pidx %#lx",
+ mpte, mpte->pindex, va, pmap_pde_pindex(va)));
if (pmap_insert_pt_page(pmap, mpte, true)) {
atomic_add_long(&pmap_pde_p_failures, 1);
CTR2(KTR_PMAP,
@@ -10625,8 +10639,8 @@ pmap_pti_init(void)
va = __pcpu[i].pc_common_tss.tss_ist4 + sizeof(struct nmi_pcpu);
pmap_pti_add_kva_locked(va - DBG_STACK_SIZE, va, false);
}
- pmap_pti_add_kva_locked((vm_offset_t)KERNBASE + NBPDR,
- (vm_offset_t)etext, true);
+ pmap_pti_add_kva_locked((vm_offset_t)KERNSTART, (vm_offset_t)etext,
+ true);
pti_finalized = true;
VM_OBJECT_WUNLOCK(pti_obj);
}
diff --git a/sys/amd64/include/md_var.h b/sys/amd64/include/md_var.h
index b66e314d99b1..53139711bbff 100644
--- a/sys/amd64/include/md_var.h
+++ b/sys/amd64/include/md_var.h
@@ -49,11 +49,8 @@ extern vm_paddr_t intel_graphics_stolen_size;
extern int la57;
-/*
- * The file "conf/ldscript.amd64" defines the symbol "kernphys". Its
- * value is the physical address at which the kernel is loaded.
- */
-extern char kernphys[];
+extern vm_paddr_t kernphys;
+extern vm_paddr_t KERNend;
extern bool efi_boot;
diff --git a/sys/amd64/include/vmparam.h b/sys/amd64/include/vmparam.h
index c7ffb218dd4a..b6f79ef8ca84 100644
--- a/sys/amd64/include/vmparam.h
+++ b/sys/amd64/include/vmparam.h
@@ -149,8 +149,10 @@
#endif
/*
- * Kernel physical load address. Needs to be aligned at 2MB superpage
- * boundary.
+ * Kernel physical load address for non-UEFI boot and for legacy UEFI loader.
+ * Newer UEFI loader loads kernel anywhere below 4G, with memory allocated
+ * by boot services.
+ * Needs to be aligned at 2MB superpage boundary.
*/
#ifndef KERNLOAD
#define KERNLOAD 0x200000
@@ -186,7 +188,17 @@
#define LARGEMAP_MIN_ADDRESS KV4ADDR(LMSPML4I, 0, 0, 0)
#define LARGEMAP_MAX_ADDRESS KV4ADDR(LMEPML4I + 1, 0, 0, 0)
+/*
+ * Formally kernel mapping starts at KERNBASE, but kernel linker
+ * script leaves first PDE reserved. For legacy BIOS boot, kernel is
+ * loaded at KERNLOAD = 2M, and initial kernel page table maps
+ * physical memory from zero to KERNend starting at KERNBASE.
+ *
+ * KERNSTART is where the first actual kernel page is mapped, after
+ * the compatibility mapping.
+ */
#define KERNBASE KV4ADDR(KPML4I, KPDPI, 0, 0)
+#define KERNSTART (KERNBASE + NBPDR)
#define UPT_MAX_ADDRESS KV4ADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)
#define UPT_MIN_ADDRESS KV4ADDR(PML4PML4I, 0, 0, 0)
diff --git a/sys/conf/ldscript.amd64 b/sys/conf/ldscript.amd64
index c11ffb6ea49f..68085ff7435c 100644
--- a/sys/conf/ldscript.amd64
+++ b/sys/conf/ldscript.amd64
@@ -5,15 +5,14 @@ ENTRY(btext)
SEARCH_DIR("/usr/lib");
SECTIONS
{
- kernphys = kernload;
/* Read-only sections, merged into text segment: */
- . = kernbase + kernphys + SIZEOF_HEADERS;
+ . = kernbase + kernload + SIZEOF_HEADERS;
/*
* Use the AT keyword in order to set the right LMA that contains
* the physical address where the section should be loaded. This is
* needed for the Xen loader which honours the LMA.
*/
- .interp : AT (kernphys + SIZEOF_HEADERS) { *(.interp) }
+ .interp : AT (kernload + SIZEOF_HEADERS) { *(.interp) }
.hash : { *(.hash) }
.gnu.hash : { *(.gnu.hash) }
.dynsym : { *(.dynsym) }
More information about the dev-commits-src-all
mailing list