git: c946f699856f - stable/13 - amd64: rework AP startup

Konstantin Belousov kib at FreeBSD.org
Mon Aug 23 23:22:20 UTC 2021


The branch stable/13 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=c946f699856f6737a5256d7c9f746ac8035339ee

commit c946f699856f6737a5256d7c9f746ac8035339ee
Author:     Konstantin Belousov <kib at FreeBSD.org>
AuthorDate: 2021-07-10 19:38:42 +0000
Commit:     Konstantin Belousov <kib at FreeBSD.org>
CommitDate: 2021-08-23 23:21:12 +0000

    amd64: rework AP startup
    
    (cherry picked from commit d6717f877872e62d9df1e0ce2d8856620c993924)
---
 sys/amd64/amd64/machdep.c    |   4 +-
 sys/amd64/amd64/mp_machdep.c | 187 ++++++++++++++++---------------------------
 sys/amd64/amd64/mpboot.S     |  64 +++++++--------
 sys/amd64/include/smp.h      |   3 +-
 sys/x86/x86/mp_x86.c         |   5 --
 sys/x86/xen/pv.c             |   1 -
 6 files changed, 96 insertions(+), 168 deletions(-)

diff --git a/sys/amd64/amd64/machdep.c b/sys/amd64/amd64/machdep.c
index 93030cbe7126..840570be534a 100644
--- a/sys/amd64/amd64/machdep.c
+++ b/sys/amd64/amd64/machdep.c
@@ -187,7 +187,6 @@ struct init_ops init_ops = {
 	.early_delay =			i8254_delay,
 	.parse_memmap =			native_parse_memmap,
 #ifdef SMP
-	.mp_bootaddress =		mp_bootaddress,
 	.start_all_aps =		native_start_all_aps,
 #endif
 #ifdef DEV_PCI
@@ -1288,8 +1287,7 @@ getmemsize(caddr_t kmdp, u_int64_t first)
 	 * is configured to support APs and APs for the system start
 	 * in real mode (e.g. SMP bare metal).
 	 */
-	if (init_ops.mp_bootaddress)
-		init_ops.mp_bootaddress(physmap, &physmap_idx);
+	alloc_ap_trampoline(physmap, &physmap_idx);
 
 	/* call pmap initialization to make new kernel address space */
 	pmap_bootstrap(&first);
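
A note on the machdep.c hunk above: with the init_ops.mp_bootaddress hook gone,
getmemsize() reserves the real-mode AP trampoline unconditionally, for both the
native and the Xen PVH paths (the matching hook removal in sys/x86/xen/pv.c
appears at the end of this diff). Only the trampoline itself still needs a
low-memory reservation; the dedicated below-4G allocation for the AP page
tables is what the rest of this commit removes. The surviving call, restated
from the hunk for reference:

	/* getmemsize(): reserve low memory for the AP startup trampoline. */
	alloc_ap_trampoline(physmap, &physmap_idx);
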
diff --git a/sys/amd64/amd64/mp_machdep.c b/sys/amd64/amd64/mp_machdep.c
index d1064262891f..082a58ada48f 100644
--- a/sys/amd64/amd64/mp_machdep.c
+++ b/sys/amd64/amd64/mp_machdep.c
@@ -106,6 +106,7 @@ char *dbg_stack;
 void *bootpcpu;
 
 extern u_int mptramp_la57;
+extern u_int mptramp_nx;
 
 /*
  * Local data and functions.
@@ -113,86 +114,6 @@ extern u_int mptramp_la57;
 
 static int	start_ap(int apic_id);
 
-static bool
-is_kernel_paddr(vm_paddr_t pa)
-{
-
-	return (pa >= trunc_2mpage(btext - KERNBASE) &&
-	   pa < round_page(_end - KERNBASE));
-}
-
-static bool
-is_mpboot_good(vm_paddr_t start, vm_paddr_t end)
-{
-
-	return (start + AP_BOOTPT_SZ <= GiB(4) && atop(end) < Maxmem);
-}
-
-/*
- * Calculate usable address in base memory for AP trampoline code.
- */
-void
-mp_bootaddress(vm_paddr_t *physmap, unsigned int *physmap_idx)
-{
-	vm_paddr_t start, end;
-	unsigned int i;
-	bool allocated;
-
-	alloc_ap_trampoline(physmap, physmap_idx);
-
-	/*
-	 * Find a memory region big enough below the 4GB boundary to
-	 * store the initial page tables.  Region must be mapped by
-	 * the direct map.
-	 *
-	 * Note that it needs to be aligned to a page boundary.
-	 */
-	allocated = false;
-	for (i = *physmap_idx; i <= *physmap_idx; i -= 2) {
-		/*
-		 * First, try to chomp at the start of the physmap region.
-		 * Kernel binary might claim it already.
-		 */
-		start = round_page(physmap[i]);
-		end = start + AP_BOOTPT_SZ;
-		if (start < end && end <= physmap[i + 1] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i] = end;
-			break;
-		}
-
-		/*
-		 * Second, try to chomp at the end.  Again, check
-		 * against kernel.
-		 */
-		end = trunc_page(physmap[i + 1]);
-		start = end - AP_BOOTPT_SZ;
-		if (start < end && start >= physmap[i] &&
-		    is_mpboot_good(start, end) &&
-		    !is_kernel_paddr(start) && !is_kernel_paddr(end - 1)) {
-			allocated = true;
-			physmap[i + 1] = start;
-			break;
-		}
-	}
-	if (allocated) {
-		mptramp_pagetables = start;
-		if (physmap[i] == physmap[i + 1] && *physmap_idx != 0) {
-			memmove(&physmap[i], &physmap[i + 2],
-			    sizeof(*physmap) * (*physmap_idx - i + 2));
-			*physmap_idx -= 2;
-		}
-	} else {
-		mptramp_pagetables = trunc_page(boot_address) - AP_BOOTPT_SZ;
-		if (bootverbose)
-			printf(
-"Cannot find enough space for the initial AP page tables, placing them at %#x",
-			    mptramp_pagetables);
-	}
-}
-
 /*
  * Initialize the IPI handlers and start up the AP's.
  */
@@ -244,6 +165,9 @@ cpu_mp_start(void)
 	assign_cpu_ids();
 
 	mptramp_la57 = la57;
+	mptramp_nx = pg_nx != 0;
+	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
+	mptramp_pagetables = kernel_pmap->pm_cr3;
 
 	/* Start each Application Processor */
 	init_ops.start_all_aps();
@@ -398,55 +322,67 @@ mp_realloc_pcpu(int cpuid, int domain)
 int
 native_start_all_aps(void)
 {
-	u_int64_t *pt5, *pt4, *pt3, *pt2;
+	vm_page_t m_pml4, m_pdp, m_pd[4];
+	pml5_entry_t old_pml45;
+	pml4_entry_t *v_pml4;
+	pdp_entry_t *v_pdp;
+	pd_entry_t *v_pd;
 	u_int32_t mpbioswarmvec;
-	int apic_id, cpu, domain, i, xo;
+	int apic_id, cpu, domain, i;
 	u_char mpbiosreason;
 
 	mtx_init(&ap_boot_mtx, "ap boot", NULL, MTX_SPIN);
 
-	/* copy the AP 1st level boot code */
-	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
-
-	/* Locate the page tables, they'll be below the trampoline */
+	/* Create a transient 1:1 mapping of low 4G */
 	if (la57) {
-		pt5 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables);
-		xo = 1;
+		m_pml4 = pmap_page_alloc_below_4g(true);
+		v_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pml4));
 	} else {
-		xo = 0;
+		v_pml4 = &kernel_pmap->pm_pmltop[0];
 	}
-	pt4 = (uint64_t *)PHYS_TO_DMAP(mptramp_pagetables + xo * PAGE_SIZE);
-	pt3 = pt4 + (PAGE_SIZE) / sizeof(u_int64_t);
-	pt2 = pt3 + (PAGE_SIZE) / sizeof(u_int64_t);
-
-	/* Create the initial 1GB replicated page tables */
-	for (i = 0; i < 512; i++) {
-		if (la57) {
-			pt5[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-			    PAGE_SIZE);
-			pt5[i] |= PG_V | PG_RW | PG_U;
-		}
-
-		/*
-		 * Each slot of the level 4 pages points to the same
-		 * level 3 page.
-		 */
-		pt4[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    (xo + 1) * PAGE_SIZE);
-		pt4[i] |= PG_V | PG_RW | PG_U;
-
-		/*
-		 * Each slot of the level 3 pages points to the same
-		 * level 2 page.
-		 */
-		pt3[i] = (u_int64_t)(uintptr_t)(mptramp_pagetables +
-		    ((xo + 2) * PAGE_SIZE));
-		pt3[i] |= PG_V | PG_RW | PG_U;
-
-		/* The level 2 page slots are mapped with 2MB pages for 1GB. */
-		pt2[i] = i * (2 * 1024 * 1024);
-		pt2[i] |= PG_V | PG_RW | PG_PS | PG_U;
+	m_pdp = pmap_page_alloc_below_4g(true);
+	v_pdp = (pdp_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pdp));
+	m_pd[0] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[0]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (i << PDRSHIFT) | X86_PG_V | X86_PG_RW | X86_PG_A |
+		    X86_PG_M | PG_PS;
+	m_pd[1] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[1]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (NBPDP + (i << PDRSHIFT)) | X86_PG_V | X86_PG_RW |
+		    X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[2] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[2]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (2UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	m_pd[3] = pmap_page_alloc_below_4g(false);
+	v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[3]));
+	for (i = 0; i < NPDEPG; i++)
+		v_pd[i] = (3UL * NBPDP + (i << PDRSHIFT)) | X86_PG_V |
+		    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
+	v_pdp[0] = VM_PAGE_TO_PHYS(m_pd[0]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[1] = VM_PAGE_TO_PHYS(m_pd[1]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[2] = VM_PAGE_TO_PHYS(m_pd[2]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	v_pdp[3] = VM_PAGE_TO_PHYS(m_pd[3]) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	old_pml45 = kernel_pmap->pm_pmltop[0];
+	if (la57) {
+		kernel_pmap->pm_pmltop[0] = VM_PAGE_TO_PHYS(m_pml4) |
+		    X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 	}
+	v_pml4[0] = VM_PAGE_TO_PHYS(m_pdp) | X86_PG_V |
+	    X86_PG_RW | X86_PG_A | X86_PG_M;
+	pmap_invalidate_all(kernel_pmap);
+
+	/* copy the AP 1st level boot code */
+	bcopy(mptramp_start, (void *)PHYS_TO_DMAP(boot_address), bootMP_size);
+	if (bootverbose)
+		printf("AP boot address %#x\n", boot_address);
 
 	/* save the current value of the warm-start vector */
 	if (!efi_boot)
@@ -517,6 +453,17 @@ native_start_all_aps(void)
 	outb(CMOS_REG, BIOS_RESET);
 	outb(CMOS_DATA, mpbiosreason);
 
+	/* Destroy transient 1:1 mapping */
+	kernel_pmap->pm_pmltop[0] = old_pml45;
+	invlpg(0);
+	if (la57)
+		vm_page_free(m_pml4);
+	vm_page_free(m_pd[3]);
+	vm_page_free(m_pd[2]);
+	vm_page_free(m_pd[1]);
+	vm_page_free(m_pd[0]);
+	vm_page_free(m_pdp);
+
 	/* number of APs actually started */
 	return (mp_naps);
 }
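
A reading aid for the native_start_all_aps() hunk above: the four unrolled
page-directory blocks all follow one pattern. The loop below is an equivalent
sketch, not the committed code; it reuses the identifiers from the diff and
introduces an extra index j that the committed function does not declare. Each
of the four PD pages maps one gigabyte (NBPDP) of physical memory with 2M
(PG_PS) superpages, which together with the single PDP page gives the transient
identity map of the low 4G:

	/* Sketch only: loop form of the unrolled PD setup above. */
	for (i = 0; i < 4; i++) {
		m_pd[i] = pmap_page_alloc_below_4g(false);
		v_pd = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m_pd[i]));
		for (j = 0; j < NPDEPG; j++)
			v_pd[j] = ((vm_paddr_t)i * NBPDP +
			    ((vm_paddr_t)j << PDRSHIFT)) | X86_PG_V |
			    X86_PG_RW | X86_PG_A | X86_PG_M | PG_PS;
		v_pdp[i] = VM_PAGE_TO_PHYS(m_pd[i]) | X86_PG_V | X86_PG_RW |
		    X86_PG_A | X86_PG_M;
	}
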
diff --git a/sys/amd64/amd64/mpboot.S b/sys/amd64/amd64/mpboot.S
index afdcffa573a4..1b5657d3bef8 100644
--- a/sys/amd64/amd64/mpboot.S
+++ b/sys/amd64/amd64/mpboot.S
@@ -95,12 +95,25 @@ protmode:
 	 * is later enabled.
 	 */
 	mov	%cr4, %eax
-	orl	$CR4_PAE, %eax
+	orl	$(CR4_PAE | CR4_PGE), %eax
 	cmpb	$0, mptramp_la57-mptramp_start(%ebx)
 	je	1f
 	orl	$CR4_LA57, %eax
 1:	mov	%eax, %cr4
 
+	/*
+	 * If the BSP reported NXE support, enable EFER.NXE for all APs
+	 * prior to loading %cr3. This avoids page faults if the AP
+	 * encounters memory marked with the NX bit prior to detecting and
+	 * enabling NXE support.
+	 */
+	cmpb	$0,mptramp_nx-mptramp_start(%ebx)
+	je	2f
+	movl	$MSR_EFER, %ecx
+	rdmsr
+	orl	$EFER_NXE, %eax
+	wrmsr
+2:
 	/*
 	 * Enable EFER.LME so that we get long mode when all the prereqs are
 	 * in place.  In this case, it turns on when CR0_PG is finally enabled.
@@ -112,12 +125,13 @@ protmode:
 	wrmsr
 
 	/*
-	 * Point to the embedded page tables for startup.  Note that this
-	 * only gets accessed after we're actually in 64 bit mode, however
-	 * we can only set the bottom 32 bits of %cr3 in this state.  This
-	 * means we are required to use a temporary page table that is below
-	 * the 4GB limit.  %ebx is still our relocation base.  We could just
-	 * subtract 3 * PAGE_SIZE, but that would be too easy.
+	 * Load kernel page table pointer into %cr3.
+	 * %ebx is still our relocation base.
+	 *
+	 * Note that this only gets accessed after we're actually in 64 bit
+	 * mode, however we can only set the bottom 32 bits of %cr3 in this
+	 * state.  This means we depend on the kernel page table being
+	 * allocated from the low 4G.
 	 */
 	leal	mptramp_pagetables-mptramp_start(%ebx),%eax
 	movl	(%eax), %eax
@@ -155,10 +169,8 @@ jmp_64:
 	/*
 	 * Yeehar!  We're running in 64 bit mode!  We can mostly ignore our
 	 * segment registers, and get on with it.
-	 * Note that we are running at the correct virtual address, but with
-	 * a 1:1 1GB mirrored mapping over entire address space.  We had better
-	 * switch to a real %cr3 promptly so that we can get to the direct map
-	 * space. Remember that jmp is relative and that we've been relocated,
+	 * We are running at the correct virtual address space.
+	 * Note that the jmp is relative and that we've been relocated,
 	 * so use an indirect jump.
 	 */
 	.code64
@@ -220,6 +232,10 @@ mptramp_pagetables:
 mptramp_la57:
 	.long	0
 
+	.globl	mptramp_nx
+mptramp_nx:
+	.long	0
+
 	/*
 	 * The pseudo descriptor for lgdt to use.
 	 */
@@ -243,32 +259,6 @@ bootMP_size:
 	.code64
 	.p2align 4,0
 entry_64:
-	/*
-	 * If the BSP reported NXE support, enable EFER.NXE for all APs
-	 * prior to loading %cr3. This avoids page faults if the AP
-	 * encounters memory marked with the NX bit prior to detecting and
-	 * enabling NXE support.
-	 */
-	movq	pg_nx, %rbx
-	testq	%rbx, %rbx
-	je	1f
-	movl	$MSR_EFER, %ecx
-	rdmsr
-	orl	$EFER_NXE, %eax
-	wrmsr
-
-1:
-	/*
-	 * Load a real %cr3 that has all the direct map stuff and switches
-	 * off the 1GB replicated mirror.  Load a stack pointer and jump
-	 * into AP startup code in C.
-	*/
-	cmpl	$0, la57
-	jne	2f
-	movq	KPML4phys, %rax
-	jmp	3f
-2:	movq	KPML5phys, %rax
-3:	movq	%rax, %cr3
 	movq	bootSTK, %rsp
 
 	/*
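
One detail worth spelling out about the %cr3 load in protmode above: it
executes while the AP is still in 32-bit protected mode, so only the low 32
bits of %cr3 can be written. That is why the old code kept a dedicated copy of
the page tables below 4G, and why the reworked code instead requires the
kernel's own top-level page table to live there. The BSP-side half of that
contract is the pair of lines added to cpu_mp_start() in the mp_machdep.c hunk
earlier in this diff, repeated here:

	/*
	 * cpu_mp_start(): the trampoline can only load a 32-bit %cr3,
	 * so assert that the kernel page table sits below 4G before
	 * handing its physical address to the trampoline.
	 */
	MPASS(kernel_pmap->pm_cr3 < (1UL << 32));
	mptramp_pagetables = kernel_pmap->pm_cr3;
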
diff --git a/sys/amd64/include/smp.h b/sys/amd64/include/smp.h
index 8fbd89da0e57..84ee73cef723 100644
--- a/sys/amd64/include/smp.h
+++ b/sys/amd64/include/smp.h
@@ -38,8 +38,7 @@ inthand_t
 	IDTVEC(rendezvous_pti);
 
 void	invlop_handler(void);
-int	native_start_all_aps(void);
-void	mp_bootaddress(vm_paddr_t *, unsigned int *);
+int	native_start_all_aps(void);
 
 #endif /* !LOCORE */
 #endif /* SMP */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index f1c1e45e79b8..441a766f87fb 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1070,11 +1070,6 @@ init_secondary_tail(void)
 	}
 
 #ifdef __amd64__
-	/*
-	 * Enable global pages TLB extension
-	 * This also implicitly flushes the TLB 
-	 */
-	load_cr4(rcr4() | CR4_PGE);
 	if (pmap_pcid_enabled)
 		load_cr4(rcr4() | CR4_PCIDE);
 	load_ds(_udatasel);
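
For the init_secondary_tail() deletion above: the global-pages enable is not
dropped, it moves into the trampoline, which now sets CR4.PGE together with
CR4.PAE before paging is turned on (the orl $(CR4_PAE | CR4_PGE) change in the
mpboot.S hunk). The removed C code, shown again for context, ran much later on
each AP:

	/*
	 * Formerly in init_secondary_tail(): each AP enabled global
	 * pages here, implicitly flushing its TLB.  After this commit
	 * the AP arrives from mpboot.S with CR4.PGE already set, so
	 * only CR4.PCIDE remains to be enabled from C.
	 */
	load_cr4(rcr4() | CR4_PGE);
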
diff --git a/sys/x86/xen/pv.c b/sys/x86/xen/pv.c
index 2fd698772f9d..59c5b464aace 100644
--- a/sys/x86/xen/pv.c
+++ b/sys/x86/xen/pv.c
@@ -134,7 +134,6 @@ struct init_ops xen_pvh_init_ops = {
 	.early_delay			= xen_delay,
 	.parse_memmap			= xen_pvh_parse_memmap,
 #ifdef SMP
-	.mp_bootaddress			= mp_bootaddress,
 	.start_all_aps			= native_start_all_aps,
 #endif
 	.msi_init			= msi_init,

