git: af1c6d3f3013 - main - amd64: do not leak pcpu pages

From: Konstantin Belousov <kib@FreeBSD.org>
Date: Thu, 04 May 2023 15:40:14 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=af1c6d3f3013062370692c8e1e9c87bb138fbbd9

commit af1c6d3f3013062370692c8e1e9c87bb138fbbd9
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2023-05-03 09:41:46 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2023-05-04 15:39:22 +0000

    amd64: do not leak pcpu pages
    
    Do not preallocate pcpu area backing pages during early startup; only
    allocate enough KVA for pcpu[MAXCPU] and the backing page for the BSP.
    The remaining pages are allocated once the number of CPUs and their
    assignment to NUMA domains are known.
    
    PCPUs are not accessed until they are initialized, which happens on AP
    startup.
    
    Reviewed by:    markj
    Sponsored by:   The FreeBSD Foundation
    Differential revision:  https://reviews.freebsd.org/D39945
---
 sys/amd64/amd64/mp_machdep.c | 52 ++++++++++++++++++++------------------------
 sys/amd64/amd64/pmap.c       | 17 ++++++++++-----
 2 files changed, 34 insertions(+), 35 deletions(-)
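
Before reading the diff, it helps to see why the old code leaked. Below is
the removed mp_realloc_pcpu(), reproduced from the first hunk with added
comments marking the leak; the annotations are editorial, not part of the
commit:

	static void
	mp_realloc_pcpu(int cpuid, int domain)
	{
		vm_page_t m;
		vm_offset_t oa, na;

		oa = (vm_offset_t)&__pcpu[cpuid];
		if (vm_phys_domain(pmap_kextract(oa)) == domain)
			return;		/* already backed from the right domain */
		m = vm_page_alloc_noobj_domain(domain, 0);
		if (m == NULL)
			return;
		na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
		pagecopy((void *)oa, (void *)na);
		/*
		 * pmap_qenter() overwrites the PTE that was the only
		 * reference to the boot-time page preallocated in
		 * pmap_bootstrap(); that page is never freed, hence the
		 * "XXX old pcpu page leaked" comment in the hunk below.
		 */
		pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
	}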

diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index f6c3446e9981..5fdde0bb887d 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -290,29 +290,32 @@ init_secondary(void)
 	init_secondary_tail();
 }
 
-/*******************************************************************
- * local functions and data
- */
-
-#ifdef NUMA
 static void
-mp_realloc_pcpu(int cpuid, int domain)
+amd64_mp_alloc_pcpu(void)
 {
 	vm_page_t m;
-	vm_offset_t oa, na;
-
-	oa = (vm_offset_t)&__pcpu[cpuid];
-	if (vm_phys_domain(pmap_kextract(oa)) == domain)
-		return;
-	m = vm_page_alloc_noobj_domain(domain, 0);
-	if (m == NULL)
-		return;
-	na = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
-	pagecopy((void *)oa, (void *)na);
-	pmap_qenter((vm_offset_t)&__pcpu[cpuid], &m, 1);
-	/* XXX old pcpu page leaked. */
-}
+	int cpu;
+
+	/* Allocate pcpu areas to the correct domain. */
+	for (cpu = 1; cpu < mp_ncpus; cpu++) {
+#ifdef NUMA
+		m = NULL;
+		if (vm_ndomains > 1) {
+			m = vm_page_alloc_noobj_domain(
+			    acpi_pxm_get_cpu_locality(cpu_apic_ids[cpu]), 0);
+		}
+		if (m == NULL)
 #endif
+			m = vm_page_alloc_noobj(0);
+		if (m == NULL)
+			panic("cannot alloc pcpu page for cpu %d", cpu);
+		pmap_qenter((vm_offset_t)&__pcpu[cpu], &m, 1);
+	}
+}
+
+/*******************************************************************
+ * local functions and data
+ */
 
 /*
  * start each AP in our list
@@ -330,6 +333,7 @@ start_all_aps(void)
 	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
+	amd64_mp_alloc_pcpu();
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
 	MPASS(bootMP_size <= PAGE_SIZE);
@@ -403,16 +407,6 @@ start_all_aps(void)
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, BIOS_WARM);	/* 'warm-start' */
 
-	/* Relocate pcpu areas to the correct domain. */
-#ifdef NUMA
-	if (vm_ndomains > 1)
-		for (cpu = 1; cpu < mp_ncpus; cpu++) {
-			apic_id = cpu_apic_ids[cpu];
-			domain = acpi_pxm_get_cpu_locality(apic_id);
-			mp_realloc_pcpu(cpu, domain);
-		}
-#endif
-
 	/* start each AP */
 	domain = 0;
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
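
A note on the new amd64_mp_alloc_pcpu() above: the "if (m == NULL)" placed
just before the #endif deliberately guards the shared fallback line, so one
statement serves both the NUMA and non-NUMA builds. With the preprocessor
conditionals flattened, the per-CPU allocation reduces to the sketch below;
"numa_enabled" stands for the kernel being built with options NUMA, and
"cpu_domain(cpu)" abbreviates the acpi_pxm_get_cpu_locality(cpu_apic_ids[cpu])
lookup (both names are illustrative, not real symbols):

	m = NULL;
	if (numa_enabled && vm_ndomains > 1)
		m = vm_page_alloc_noobj_domain(cpu_domain(cpu), 0);
	if (m == NULL)		/* !NUMA, single domain, or domain out of pages */
		m = vm_page_alloc_noobj(0);	/* fall back to any domain */
	if (m == NULL)
		panic("cannot alloc pcpu page for cpu %d", cpu);
	pmap_qenter((vm_offset_t)&__pcpu[cpu], &m, 1);
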
diff --git a/sys/amd64/amd64/pmap.c b/sys/amd64/amd64/pmap.c
index 55086125fbb9..1009736472dc 100644
--- a/sys/amd64/amd64/pmap.c
+++ b/sys/amd64/amd64/pmap.c
@@ -1902,7 +1902,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	vm_offset_t va;
 	pt_entry_t *pte, *pcpu_pte;
 	struct region_descriptor r_gdt;
-	uint64_t cr4, pcpu_phys;
+	uint64_t cr4, pcpu0_phys;
 	u_long res;
 	int i;
 
@@ -1917,7 +1917,7 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	 */
 	create_pagetables(firstaddr);
 
-	pcpu_phys = allocpages(firstaddr, MAXCPU);
+	pcpu0_phys = allocpages(firstaddr, 1);
 
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
@@ -1995,10 +1995,15 @@ pmap_bootstrap(vm_paddr_t *firstaddr)
 	SYSMAP(struct pcpu *, pcpu_pte, __pcpu, MAXCPU);
 	virtual_avail = va;
 
-	for (i = 0; i < MAXCPU; i++) {
-		pcpu_pte[i] = (pcpu_phys + ptoa(i)) | X86_PG_V | X86_PG_RW |
-		    pg_g | pg_nx | X86_PG_M | X86_PG_A;
-	}
+	/*
+	 * Map the BSP PCPU now, the rest of the PCPUs are mapped by
+	 * amd64_mp_alloc_pcpu()/start_all_aps() when we know the
+	 * number of CPUs and NUMA affinity.
+	 */
+	pcpu_pte[0] = pcpu0_phys | X86_PG_V | X86_PG_RW | pg_g | pg_nx |
+	    X86_PG_M | X86_PG_A;
+	for (i = 1; i < MAXCPU; i++)
+		pcpu_pte[i] = 0;
 
 	/*
 	 * Re-initialize PCPU area for BSP after switching.
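
The pmap.c hunk above leaves only the BSP slot mapped at boot. The resulting
state, which holds from pmap_bootstrap() until start_all_aps() calls
amd64_mp_alloc_pcpu(), can be written as a sketch of the invariant (MPASS is
the kernel's assertion macro; the check itself is illustrative, not in the
commit):

	/* After pmap_bootstrap(), before amd64_mp_alloc_pcpu(): */
	MPASS((pcpu_pte[0] & X86_PG_V) != 0);	/* BSP pcpu page is mapped */
	for (i = 1; i < MAXCPU; i++)
		MPASS(pcpu_pte[i] == 0);	/* AP slots unmapped */

A stray access to __pcpu[i] for i > 0 before the APs are set up now faults
immediately instead of landing on a preallocated wrong-domain page, which is
safe because, as the commit message notes, PCPUs are not accessed until AP
startup initializes them.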