git: 589aed00e36c - main - sched: separate out schedinit_ap()

From: Kyle Evans <kevans_at_FreeBSD.org>
Date: Wed, 03 Nov 2021 20:55:21 UTC
The branch main has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=589aed00e36c22733d3fd9c9016deccf074830b1

commit 589aed00e36c22733d3fd9c9016deccf074830b1
Author:     Kyle Evans <kevans@FreeBSD.org>
AuthorDate: 2021-11-02 18:06:47 +0000
Commit:     Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2021-11-03 20:54:59 +0000

    sched: separate out schedinit_ap()
    
    schedinit_ap() sets up an AP for a later call to sched_throw(NULL).
    
    Currently, ULE sets up some pcpu bits and fixes the idlethread lock with
    a call to sched_throw(NULL); this results in a window where curthread is
    setup in platforms' init_secondary(), but it has the wrong td_lock.
    Typical platform AP startup procedure looks something like:
    
    - Setup curthread
    - ... other stuff, including cpu_initclocks_ap()
    - Signal smp_started
    - sched_throw(NULL) to enter the scheduler
    
    cpu_initclocks_ap() may have callouts to process (e.g., nvme) and
    attempt to sched_add() for this AP, but this attempt fails because
    of the noted violated assumption leading to locking heartburn in
    sched_setpreempt().
    
    Interrupts are still disabled until cpu_throw() so we're not really at
    risk of being preempted -- just let the scheduler in on it a little
    earlier as part of setting up curthread.
    
    Reviewed by:    alfredo, kib, markj
    Triage help from:       andrew, markj
    Smoke-tested by:        alfredo (ppc), kevans (arm64, x86), mhorne (arm)
    Differential Revision:  https://reviews.freebsd.org/D32797
---
 sys/arm/arm/mp_machdep.c      |  1 +
 sys/arm64/arm64/mp_machdep.c  |  1 +
 sys/kern/sched_4bsd.c         |  7 +++++++
 sys/kern/sched_ule.c          | 29 ++++++++++++++++++++++-------
 sys/mips/mips/mp_machdep.c    |  1 +
 sys/powerpc/aim/mp_cpudep.c   |  2 ++
 sys/powerpc/booke/mp_cpudep.c |  2 ++
 sys/riscv/riscv/mp_machdep.c  |  1 +
 sys/sys/sched.h               |  5 +++++
 sys/x86/x86/mp_x86.c          |  1 +
 10 files changed, 43 insertions(+), 7 deletions(-)

diff --git a/sys/arm/arm/mp_machdep.c b/sys/arm/arm/mp_machdep.c
index 6368f7b72da9..4089af5929eb 100644
--- a/sys/arm/arm/mp_machdep.c
+++ b/sys/arm/arm/mp_machdep.c
@@ -182,6 +182,7 @@ init_secondary(int cpu)
 	pc->pc_curthread = pc->pc_idlethread;
 	pc->pc_curpcb = pc->pc_idlethread->td_pcb;
 	set_curthread(pc->pc_idlethread);
+	schedinit_ap();
 #ifdef VFP
 	vfp_init();
 #endif
diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
index 15e05ef46262..b42f65b9e399 100644
--- a/sys/arm64/arm64/mp_machdep.c
+++ b/sys/arm64/arm64/mp_machdep.c
@@ -255,6 +255,7 @@ init_secondary(uint64_t cpu)
 	/* Initialize curthread */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	pcpup->pc_curthread = pcpup->pc_idlethread;
+	schedinit_ap();
 
 	/* Initialize curpmap to match TTBR0's current setting. */
 	pmap0 = vmspace_pmap(&vmspace0);
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index ddd65b94f0ff..6ba41eb80dcc 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -678,6 +678,13 @@ schedinit(void)
 	mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN);
 }
 
+void
+schedinit_ap(void)
+{
+
+	/* Nothing needed. */
+}
+
 int
 sched_runnable(void)
 {
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 1b9473b93773..ce7ce4cd2bd8 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -1743,6 +1743,26 @@ schedinit(void)
 	ts0->ts_cpu = curcpu;	/* set valid CPU number */
 }
 
+/*
+ * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that
+ * the pcpu requirements are met for any calls in the period between curthread
+ * initialization and sched_throw().  One can safely add threads to the queue
+ * before sched_throw(), for instance, as long as the thread lock is setup
+ * correctly.
+ *
+ * TDQ_SELF() relies on the below sched pcpu setting; it may be used only
+ * after schedinit_ap().
+ */
+void
+schedinit_ap(void)
+{
+
+#ifdef SMP
+	PCPU_SET(sched, DPCPU_PTR(tdq));
+#endif
+	PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF());
+}
+
 /*
  * This is only somewhat accurate since given many processes of the same
  * priority they will switch when their slices run out, which will be
@@ -2973,19 +2993,14 @@ sched_throw(struct thread *td)
 	struct thread *newtd;
 	struct tdq *tdq;
 
+	tdq = TDQ_SELF();
 	if (__predict_false(td == NULL)) {
-#ifdef SMP
-		PCPU_SET(sched, DPCPU_PTR(tdq));
-#endif
-		/* Correct spinlock nesting and acquire the correct lock. */
-		tdq = TDQ_SELF();
 		TDQ_LOCK(tdq);
+		/* Correct spinlock nesting. */
 		spinlock_exit();
 		PCPU_SET(switchtime, cpu_ticks());
 		PCPU_SET(switchticks, ticks);
-		PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq);
 	} else {
-		tdq = TDQ_SELF();
 		THREAD_LOCK_ASSERT(td, MA_OWNED);
 		THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq));
 		tdq_load_rem(tdq, td);
diff --git a/sys/mips/mips/mp_machdep.c b/sys/mips/mips/mp_machdep.c
index 1a5a023db381..dc089db1d189 100644
--- a/sys/mips/mips/mp_machdep.c
+++ b/sys/mips/mips/mp_machdep.c
@@ -311,6 +311,7 @@ smp_init_secondary(u_int32_t cpuid)
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
+	schedinit_ap();
 
 	mtx_lock_spin(&ap_boot_mtx);
 
diff --git a/sys/powerpc/aim/mp_cpudep.c b/sys/powerpc/aim/mp_cpudep.c
index 33aae520c4b2..a73246487683 100644
--- a/sys/powerpc/aim/mp_cpudep.c
+++ b/sys/powerpc/aim/mp_cpudep.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/bus.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/smp.h>
 
 #include <machine/bus.h>
@@ -134,6 +135,7 @@ cpudep_ap_bootstrap(void)
 #endif
 	pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb;
 	sp = pcpup->pc_curpcb->pcb_sp;
+	schedinit_ap();
 
 	return (sp);
 }
diff --git a/sys/powerpc/booke/mp_cpudep.c b/sys/powerpc/booke/mp_cpudep.c
index c07e8c06ce05..709578d8e1b4 100644
--- a/sys/powerpc/booke/mp_cpudep.c
+++ b/sys/powerpc/booke/mp_cpudep.c
@@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/bus.h>
 #include <sys/pcpu.h>
 #include <sys/proc.h>
+#include <sys/sched.h>
 #include <sys/smp.h>
 
 #include <machine/pcb.h>
@@ -85,6 +86,7 @@ cpudep_ap_bootstrap()
 #endif
 	pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb;
 	sp = pcpup->pc_curpcb->pcb_sp;
+	schedinit_ap();
 
 	/* XXX shouldn't the pcb_sp be checked/forced for alignment here?? */
 
diff --git a/sys/riscv/riscv/mp_machdep.c b/sys/riscv/riscv/mp_machdep.c
index 1dadf19ce51f..57d5606a3b88 100644
--- a/sys/riscv/riscv/mp_machdep.c
+++ b/sys/riscv/riscv/mp_machdep.c
@@ -248,6 +248,7 @@ init_secondary(uint64_t hart)
 	/* Initialize curthread */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	pcpup->pc_curthread = pcpup->pc_idlethread;
+	schedinit_ap();
 
 	/*
 	 * Identify current CPU. This is necessary to setup
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index 64651ffa9c90..8041a2bc12d4 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -226,6 +226,11 @@ SYSINIT(name, SI_SUB_LAST, SI_ORDER_MIDDLE, name ## _add_proc, NULL);
  * Fixup scheduler state for proc0 and thread0
  */
 void schedinit(void);
+
+/*
+ * Fixup scheduler state for secondary APs
+ */
+void schedinit_ap(void);
 #endif /* _KERNEL */
 
 /* POSIX 1003.1b Process Scheduling */
diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
index ca1125886619..1fac244cbed7 100644
--- a/sys/x86/x86/mp_x86.c
+++ b/sys/x86/x86/mp_x86.c
@@ -1040,6 +1040,7 @@ init_secondary_tail(void)
 	/* Initialize curthread. */
 	KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
 	PCPU_SET(curthread, PCPU_GET(idlethread));
+	schedinit_ap();
 
 	mtx_lock_spin(&ap_boot_mtx);