git: f6b799a86b8f - main - Fix the test used to wait for AP startup on x86, arm64, riscv

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Wed, 15 Jun 2022 15:39:25 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=f6b799a86b8fb69013c10b072baed4ee6c8db2f9

commit f6b799a86b8fb69013c10b072baed4ee6c8db2f9
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2022-06-15 14:29:39 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2022-06-15 15:38:04 +0000

    Fix the test used to wait for AP startup on x86, arm64, riscv
    
    On arm64, testing pc_curpcb != NULL is not correct since pc_curpcb is
    set in pmap_switch() while the bootstrap stack is still in use.  As a
    result, smp_after_idle_runnable() can free the boot stack prematurely.
    
    Take a different approach: use smp_rendezvous() to wait for all APs to
    acknowledge an interrupt.  Since APs must not enable interrupts until
    they've entered the scheduler, i.e., switched off the boot stack, this
    provides the right guarantee without depending as much on the
    implementation of cpu_throw().  And, this approach applies to all
    platforms, so convert x86 and riscv as well.
    
    Reported by:    mmel
    Tested by:      mmel
    Reviewed by:    kib
    Fixes:          8db2e8fd16c4 ("Remove the secondary_stacks array in arm64 and riscv kernels.")
    MFC after:      2 weeks
    Sponsored by:   The FreeBSD Foundation
    Differential Revision:  https://reviews.freebsd.org/D35435
---
 sys/arm64/arm64/mp_machdep.c | 27 +++++++++++++++------------
 sys/riscv/riscv/mp_machdep.c | 27 +++++++++++++++------------
 sys/x86/x86/mp_x86.c         | 22 +++++++++++++---------
 3 files changed, 43 insertions(+), 33 deletions(-)

diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index 74281f9a88e5..adf5592832c5 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -183,7 +183,7 @@ release_aps(void *dummy __unused)
 
 	started = 0;
 	for (i = 0; i < 2000; i++) {
-		if (smp_started) {
+		if (atomic_load_acq_int(&smp_started) != 0) {
 			printf("done\n");
 			return;
 		}
@@ -290,11 +290,6 @@ init_secondary(uint64_t cpu)
 
 	kcsan_cpu_init(cpu);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	/* Enter the scheduler */
 	sched_ap_entry();
 
@@ -305,16 +300,24 @@ init_secondary(uint64_t cpu)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		if (bootstacks[cpu] != NULL) {
-			pc = pcpu_find(cpu);
-			while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-				cpu_spinwait();
+		if (bootstacks[cpu] != NULL)
 			kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE);
-		}
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
diff --git a/sys/riscv/riscv/mp_machdep.c b/sys/riscv/riscv/mp_machdep.c
index c460eb033799..f0048284a0c7 100644
--- a/sys/riscv/riscv/mp_machdep.c
+++ b/sys/riscv/riscv/mp_machdep.c
@@ -210,7 +210,7 @@ release_aps(void *dummy __unused)
 	sbi_send_ipi(mask.__bits);
 
 	for (i = 0; i < 2000; i++) {
-		if (smp_started)
+		if (atomic_load_acq_int(&smp_started))
 			return;
 		DELAY(1000);
 	}
@@ -284,11 +284,6 @@ init_secondary(uint64_t hart)
 
 	mtx_unlock_spin(&ap_boot_mtx);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	/* Enter the scheduler */
 	sched_ap_entry();
 
@@ -299,16 +294,24 @@ init_secondary(uint64_t hart)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu <= mp_maxid; cpu++) {
-		if (bootstacks[cpu] != NULL) {
-			pc = pcpu_find(cpu);
-			while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-				cpu_spinwait();
+		if (bootstacks[cpu] != NULL)
 			kmem_free((vm_offset_t)bootstacks[cpu], PAGE_SIZE);
-		}
 	}
 }
 SYSINIT(smp_after_idle_runnable, SI_SUB_SMP, SI_ORDER_ANY,
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index 3ca11700f2f2..6dab7987e208 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1129,11 +1129,6 @@ init_secondary_tail(void)
 
 	kcsan_cpu_init(cpuid);
 
-	/*
-	 * Assert that smp_after_idle_runnable condition is reasonable.
-	 */
-	MPASS(PCPU_GET(curpcb) == NULL);
-
 	sched_ap_entry();
 
 	panic("scheduler returned us to %s", __func__);
@@ -1143,13 +1138,22 @@ init_secondary_tail(void)
 static void
 smp_after_idle_runnable(void *arg __unused)
 {
-	struct pcpu *pc;
 	int cpu;
 
+	if (mp_ncpus == 1)
+		return;
+
+	KASSERT(smp_started != 0, ("%s: SMP not started yet", __func__));
+
+	/*
+	 * Wait for all APs to handle an interrupt.  After that, we know that
+	 * the APs have entered the scheduler at least once, so the boot stacks
+	 * are safe to free.
+	 */
+	smp_rendezvous(smp_no_rendezvous_barrier, NULL,
+	    smp_no_rendezvous_barrier, NULL);
+
 	for (cpu = 1; cpu < mp_ncpus; cpu++) {
-		pc = pcpu_find(cpu);
-		while (atomic_load_ptr(&pc->pc_curpcb) == NULL)
-			cpu_spinwait();
 		kmem_free((vm_offset_t)bootstacks[cpu], kstack_pages *
 		    PAGE_SIZE);
 	}