git: c31c881f7eee - stable/13 - Fix the test used to wait for AP startup on x86, arm64, riscv

From: Mark Johnston <markj@FreeBSD.org>
Date: Wed, 29 Jun 2022 14:40:56 UTC
The branch stable/13 has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=c31c881f7eee79ab7c5f820952cbf22af88da0db

commit c31c881f7eee79ab7c5f820952cbf22af88da0db
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2022-06-15 14:29:39 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-06-29 14:13:44 +0000

    Fix the test used to wait for AP startup on x86, arm64, riscv
    
    On arm64, testing pc_curpcb != NULL is not correct since pc_curpcb is
    set in pmap_switch() while the bootstrap stack is still in use.  As a
    result, smp_after_idle_runnable() can free the boot stack prematurely.
    
    Take a different approach: use smp_rendezvous() to wait for all APs to
    acknowledge an interrupt.  Since APs must not enable interrupts until
    they've entered the scheduler, i.e., switched off the boot stack, this
    provides the right guarantee without depending as much on the
    implementation of cpu_throw().  And, this approach applies to all
    platforms, so convert x86 and riscv as well.
    
    Reported by:    mmel
    Tested by:      mmel
    Reviewed by:    kib
    Fixes:          8db2e8fd16c4 ("Remove the secondary_stacks array in arm64 and riscv kernels.")
    Sponsored by:   The FreeBSD Foundation
    
    (cherry picked from commit f6b799a86b8fb69013c10b072baed4ee6c8db2f9)
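
For reference, the new wait-and-free logic reads roughly as follows when condensed into one place. This is a sketch assembled from the arm64 hunk in the diff below, not a drop-in copy of any one file: riscv is identical except that its loop runs to mp_maxid, and x86 omits the NULL check and frees kstack_pages * PAGE_SIZE per stack. bootstacks[] is the per-architecture array of AP boot stack pointers; it is declared extern here purely for illustration.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/cpuset.h>
#include <sys/kernel.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_extern.h>

/* Per-architecture AP boot stack pointers, filled in during AP startup. */
extern void *bootstacks[];

static void
smp_after_idle_runnable(void *arg __unused)
{
	int cpu;

	if (mp_ncpus == 1)
		return;

	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));

	/*
	 * An AP can only answer the rendezvous IPI with interrupts
	 * enabled, and it enables interrupts only after entering the
	 * scheduler, i.e., after switching off its boot stack.  Once the
	 * rendezvous completes, every boot stack is safe to free.
	 */
	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
	    smp_no_rendezvous_barrier, NULL);

	for (cpu = 1; cpu < mp_ncpus; cpu++) {
		if (bootstacks[cpu] != NULL)
			kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE);
	}
}
SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
    smp_after_idle_runnable, NULL);
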
---
 sys/arm64/arm64/mp_machdep.c | 27 +++++++++++++++------------
 sys/riscv/riscv/mp_machdep.c | 27 +++++++++++++++------------
 sys/x86/x86/mp_x86.c         | 22 +++++++++++++---------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index b42f65b9e399..3e47c60088a8 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -182,7 +182,7 @@ release_aps(void *dummy __unused)
 
 	started = 0;
 	for (i = 0; i < 2000; i++) {
-		if (smp_started) {
+		if (atomic_load_acq_int(&smp_started) != 0) {
 			printf("done\n");
 			return;
 		}
@@ -287,11 +287,6 @@ init_secondary(uint64_t cpu)
 
 	kcsan_cpu_init(cpu);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	/* Enter the scheduler */
 	sched_throw(NULL);
 
@@ -302,16 +297,24 @@ init_secondary(uint64_t cpu)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		if (bootstacks[cpu] != NULL) {
-			pc = pcpu_find(cpu);
-			while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-				cpu_spinwait();
+		if (bootstacks[cpu] != NULL)
 			kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE);
-		}
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
diff --git a/sys/riscv/riscv/mp_machdep.c b/sys/riscv/riscv/mp_machdep.c
index 6a66b7eb80f7..e038b65d9aba 100644
--- a/sys/riscv/riscv/mp_machdep.c
+++ b/sys/riscv/riscv/mp_machdep.c
@@ -211,7 +211,7 @@ release_aps(void *dummy __unused)
 	sbi_send_ipi(mask.__bits);
 
 	for (i = 0; i < 2000; i++) {
-		if (smp_started)
+		if (atomic_load_acq_int(&smp_started))
 			return;
 		DELAY(1000);
 	}
@@ -285,11 +285,6 @@ init_secondary(uint64_t hart)
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	/* Enter the scheduler */
 	sched_throw(NULL);
 
@@ -300,16 +295,24 @@ init_secondary(uint64_t hart)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu <= mp_maxid; cpu++) {
-		if (bootstacks[cpu] != NULL) {
-			pc = pcpu_find(cpu);
-			while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-				cpu_spinwait();
+		if (bootstacks[cpu] != NULL)
 			kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE);
-		}
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 4854331af17c..289885fa6213 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1103,11 +1103,6 @@ init_secondary_tail(void)
 
 	kcsan_cpu_init(cpuid);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	sched_throw(NULL);
 
 	panic("scheduler returned us to %s", __func__);
@@ -1117,13 +1112,22 @@ init_secondary_tail(void)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		pc = pcpu_find(cpu);
-		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-			cpu_spinwait();
 		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
 		    PAGE_SIZE);
 	}