Re: git: 589aed00e36c - main - sched: separate out schedinit_ap()

From: Kyle Evans <kevans_at_freebsd.org>
Date: Wed, 17 Nov 2021 22:44:29 UTC
On Wed, Nov 3, 2021 at 3:55 PM Kyle Evans <kevans@freebsd.org> wrote:
>
> The branch main has been updated by kevans:
>
> URL: https://cgit.FreeBSD.org/src/commit/?id=589aed00e36c22733d3fd9c9016deccf074830b1
>
> commit 589aed00e36c22733d3fd9c9016deccf074830b1
> Author:     Kyle Evans <kevans@FreeBSD.org>
> AuthorDate: 2021-11-02 18:06:47 +0000
> Commit:     Kyle Evans <kevans@FreeBSD.org>
> CommitDate: 2021-11-03 20:54:59 +0000
>
>     sched: separate out schedinit_ap()
>
>     schedinit_ap() sets up an AP for a later call to sched_throw(NULL).
>
>     Currently, ULE sets up some pcpu bits and fixes the idlethread lock with
>     a call to sched_throw(NULL); this results in a window where curthread is
>     setup in platforms' init_secondary(), but it has the wrong td_lock.
>     Typical platform AP startup procedure looks something like:
>
>     - Setup curthread
>     - ... other stuff, including cpu_initclocks_ap()
>     - Signal smp_started
>     - sched_throw(NULL) to enter the scheduler
>
>     cpu_initclocks_ap() may have callouts to process (e.g., nvme) and
>     attempt to sched_add() for this AP, but this attempt fails because
>     of the noted violated assumption leading to locking heartburn in
>     sched_setpreempt().
>
>     Interrupts are still disabled until cpu_throw() so we're not really at
>     risk of being preempted -- just let the scheduler in on it a little
>     earlier as part of setting up curthread.
>

What's the general consensus on supporting out-of-tree architectures that
are maintained against stable branches? I'd like to MFC this at least to
stable/13 so it isn't in the way of the nvme change that spurred it, and
I'm trying to decide whether it should have something like the following
added to make it safe for such architectures:

diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 217d685b8587..f07f5e91d8f3 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -2995,6 +2995,11 @@ sched_throw(struct thread *td)

        tdq = TDQ_SELF();
        if (__predict_false(td == NULL)) {
+               if (tdq == NULL || PCPU_GET(idlethread)->td_lock !=
+                   TDQ_LOCKPTR(tdq)) {
+                       schedinit_ap();
+                       tdq = TDQ_SELF();
+               }
                TDQ_LOCK(tdq);
                /* Correct spinlock nesting. */
                spinlock_exit();


>  sys/arm/arm/mp_machdep.c      |  1 +
>  sys/arm64/arm64/mp_machdep.c  |  1 +
>  sys/kern/sched_4bsd.c         |  7 +++++++
>  sys/kern/sched_ule.c          | 29 ++++++++++++++++++++++-------
>  sys/mips/mips/mp_machdep.c    |  1 +
>  sys/powerpc/aim/mp_cpudep.c   |  2 ++
>  sys/powerpc/booke/mp_cpudep.c |  2 ++
>  sys/riscv/riscv/mp_machdep.c  |  1 +
>  sys/sys/sched.h               |  5 +++++
>  sys/x86/x86/mp_x86.c          |  1 +
>  10 files changed, 43 insertions(+), 7 deletions(-)
>
> diff --git a/sys/arm/arm/mp_machdep.c b/sys/arm/arm/mp_machdep.c
> index 6368f7b72da9..4089af5929eb 100644
> --- a/sys/arm/arm/mp_machdep.c
> +++ b/sys/arm/arm/mp_machdep.c
> @@ -182,6 +182,7 @@ init_secondary(int cpu)
>         pc->pc_curthread = pc->pc_idlethread;
>         pc->pc_curpcb = pc->pc_idlethread->td_pcb;
>         set_curthread(pc->pc_idlethread);
> +       schedinit_ap();
>  #ifdef VFP
>         vfp_init();
>  #endif
> diff --git a/sys/arm64/arm64/mp_machdep.c b/sys/arm64/arm64/mp_machdep.c
> index 15e05ef46262..b42f65b9e399 100644
> --- a/sys/arm64/arm64/mp_machdep.c
> +++ b/sys/arm64/arm64/mp_machdep.c
> @@ -255,6 +255,7 @@ init_secondary(uint64_t cpu)
>         /* Initialize curthread */
>         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
>         pcpup->pc_curthread = pcpup->pc_idlethread;
> +       schedinit_ap();
>
>         /* Initialize curpmap to match TTBR0's current setting. */
>         pmap0 = vmspace_pmap(&vmspace0);
> diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
> index ddd65b94f0ff..6ba41eb80dcc 100644
> --- a/sys/kern/sched_4bsd.c
> +++ b/sys/kern/sched_4bsd.c
> @@ -678,6 +678,13 @@ schedinit(void)
>         mtx_init(&sched_lock, "sched lock", NULL, MTX_SPIN);
>  }
>
> +void
> +schedinit_ap(void)
> +{
> +
> +       /* Nothing needed. */
> +}
> +
>  int
>  sched_runnable(void)
>  {
> diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
> index 1b9473b93773..ce7ce4cd2bd8 100644
> --- a/sys/kern/sched_ule.c
> +++ b/sys/kern/sched_ule.c
> @@ -1743,6 +1743,26 @@ schedinit(void)
>         ts0->ts_cpu = curcpu;   /* set valid CPU number */
>  }
>
> +/*
> + * schedinit_ap() is needed prior to calling sched_throw(NULL) to ensure that
> + * the pcpu requirements are met for any calls in the period between curthread
> + * initialization and sched_throw().  One can safely add threads to the queue
> + * before sched_throw(), for instance, as long as the thread lock is setup
> + * correctly.
> + *
> + * TDQ_SELF() relies on the below sched pcpu setting; it may be used only
> + * after schedinit_ap().
> + */
> +void
> +schedinit_ap(void)
> +{
> +
> +#ifdef SMP
> +       PCPU_SET(sched, DPCPU_PTR(tdq));
> +#endif
> +       PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(TDQ_SELF());
> +}
> +
>  /*
>   * This is only somewhat accurate since given many processes of the same
>   * priority they will switch when their slices run out, which will be
> @@ -2973,19 +2993,14 @@ sched_throw(struct thread *td)
>         struct thread *newtd;
>         struct tdq *tdq;
>
> +       tdq = TDQ_SELF();
>         if (__predict_false(td == NULL)) {
> -#ifdef SMP
> -               PCPU_SET(sched, DPCPU_PTR(tdq));
> -#endif
> -               /* Correct spinlock nesting and acquire the correct lock. */
> -               tdq = TDQ_SELF();
>                 TDQ_LOCK(tdq);
> +               /* Correct spinlock nesting. */
>                 spinlock_exit();
>                 PCPU_SET(switchtime, cpu_ticks());
>                 PCPU_SET(switchticks, ticks);
> -               PCPU_GET(idlethread)->td_lock = TDQ_LOCKPTR(tdq);
>         } else {
> -               tdq = TDQ_SELF();
>                 THREAD_LOCK_ASSERT(td, MA_OWNED);
>                 THREAD_LOCKPTR_ASSERT(td, TDQ_LOCKPTR(tdq));
>                 tdq_load_rem(tdq, td);
> diff --git a/sys/mips/mips/mp_machdep.c b/sys/mips/mips/mp_machdep.c
> index 1a5a023db381..dc089db1d189 100644
> --- a/sys/mips/mips/mp_machdep.c
> +++ b/sys/mips/mips/mp_machdep.c
> @@ -311,6 +311,7 @@ smp_init_secondary(u_int32_t cpuid)
>         /* Initialize curthread. */
>         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
>         PCPU_SET(curthread, PCPU_GET(idlethread));
> +       schedinit_ap();
>
>         mtx_lock_spin(&ap_boot_mtx);
>
> diff --git a/sys/powerpc/aim/mp_cpudep.c b/sys/powerpc/aim/mp_cpudep.c
> index 33aae520c4b2..a73246487683 100644
> --- a/sys/powerpc/aim/mp_cpudep.c
> +++ b/sys/powerpc/aim/mp_cpudep.c
> @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/bus.h>
>  #include <sys/pcpu.h>
>  #include <sys/proc.h>
> +#include <sys/sched.h>
>  #include <sys/smp.h>
>
>  #include <machine/bus.h>
> @@ -134,6 +135,7 @@ cpudep_ap_bootstrap(void)
>  #endif
>         pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb;
>         sp = pcpup->pc_curpcb->pcb_sp;
> +       schedinit_ap();
>
>         return (sp);
>  }
> diff --git a/sys/powerpc/booke/mp_cpudep.c b/sys/powerpc/booke/mp_cpudep.c
> index c07e8c06ce05..709578d8e1b4 100644
> --- a/sys/powerpc/booke/mp_cpudep.c
> +++ b/sys/powerpc/booke/mp_cpudep.c
> @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$");
>  #include <sys/bus.h>
>  #include <sys/pcpu.h>
>  #include <sys/proc.h>
> +#include <sys/sched.h>
>  #include <sys/smp.h>
>
>  #include <machine/pcb.h>
> @@ -85,6 +86,7 @@ cpudep_ap_bootstrap()
>  #endif
>         pcpup->pc_curpcb = pcpup->pc_curthread->td_pcb;
>         sp = pcpup->pc_curpcb->pcb_sp;
> +       schedinit_ap();
>
>         /* XXX shouldn't the pcb_sp be checked/forced for alignment here?? */
>
> diff --git a/sys/riscv/riscv/mp_machdep.c b/sys/riscv/riscv/mp_machdep.c
> index 1dadf19ce51f..57d5606a3b88 100644
> --- a/sys/riscv/riscv/mp_machdep.c
> +++ b/sys/riscv/riscv/mp_machdep.c
> @@ -248,6 +248,7 @@ init_secondary(uint64_t hart)
>         /* Initialize curthread */
>         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
>         pcpup->pc_curthread = pcpup->pc_idlethread;
> +       schedinit_ap();
>
>         /*
>          * Identify current CPU. This is necessary to setup
> diff --git a/sys/sys/sched.h b/sys/sys/sched.h
> index 64651ffa9c90..8041a2bc12d4 100644
> --- a/sys/sys/sched.h
> +++ b/sys/sys/sched.h
> @@ -226,6 +226,11 @@ SYSINIT(name, SI_SUB_LAST, SI_ORDER_MIDDLE, name ## _add_proc, NULL);
>   * Fixup scheduler state for proc0 and thread0
>   */
>  void schedinit(void);
> +
> +/*
> + * Fixup scheduler state for secondary APs
> + */
> +void schedinit_ap(void);
>  #endif /* _KERNEL */
>
>  /* POSIX 1003.1b Process Scheduling */
> diff --git a/sys/x86/x86/mp_x86.c b/sys/x86/x86/mp_x86.c
> index ca1125886619..1fac244cbed7 100644
> --- a/sys/x86/x86/mp_x86.c
> +++ b/sys/x86/x86/mp_x86.c
> @@ -1040,6 +1040,7 @@ init_secondary_tail(void)
>         /* Initialize curthread. */
>         KASSERT(PCPU_GET(idlethread) != NULL, ("no idle thread"));
>         PCPU_SET(curthread, PCPU_GET(idlethread));
> +       schedinit_ap();
>
>         mtx_lock_spin(&ap_boot_mtx);
>