git: b249cb2b18b3 - main - SCHED_4BSD: maybe_resched() cannot schedule ast() for curthread

From: Konstantin Belousov <kib@FreeBSD.org>
Date: Thu, 29 Jan 2026 23:26:50 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=b249cb2b18b3fddae186d45fe6d984fc7bde10c4

commit b249cb2b18b3fddae186d45fe6d984fc7bde10c4
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2026-01-29 08:09:00 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2026-01-29 23:26:16 +0000

    SCHED_4BSD: maybe_resched() cannot schedule ast() for curthread
    
    maybe_resched() needs to schedule TDA_SCHED for curthread, but this
    requires taking the curthread lock while already owning some other
    thread's lock.  To avoid introducing this lock order:
    - Use a scheduler-private TDP flag.
    - Register an unconditional TDA_SCHED_PRIV handler for 4BSD.
    When an AST needs to be scheduled, i.e. the current thread must do a
    context switch on the return-to-userspace path, set the flag.
    The ast handler then calls ast_scheduler(), which gives the same
    effect as scheduling TDA_SCHED.
    
    The overhead is a single function call on each userspace return in
    the 4BSD case.
    
    Reported and tested by: pho (previous version)
    Reviewed by:    markj
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D54945
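
For illustration only, below is a minimal userspace sketch of the control
flow the commit adopts: the waking side merely sets a flag that is private
to the current thread (safe without taking any further lock, since td_pflags
is only written by curthread), and the unconditionally registered handler
consumes that flag on the way back to userspace.  The names used here
(struct fake_thread, request_resched(), resched_ast(), FAKE_TDP_RESCHED)
are hypothetical and are not kernel APIs; this models the pattern, not the
kernel implementation.

/*
 * Hypothetical userspace model of the "private flag + unconditional AST
 * handler" pattern.  Single-threaded on purpose: in the kernel, td_pflags
 * is only written by curthread, so no extra locking is required there.
 */
#include <stdio.h>

#define FAKE_TDP_RESCHED    0x0001    /* reschedule was requested */

struct fake_thread {
    unsigned int pflags;              /* models td_pflags */
};

/* Models maybe_resched(): only record that a context switch is wanted. */
static void
request_resched(struct fake_thread *ctd)
{
    ctd->pflags |= FAKE_TDP_RESCHED;
}

/* Models maybe_resched_ast(): consume the flag on userspace return. */
static void
resched_ast(struct fake_thread *td)
{
    if ((td->pflags & FAKE_TDP_RESCHED) != 0) {
        td->pflags &= ~FAKE_TDP_RESCHED;
        printf("would call ast_scheduler() here\n");
    }
}

int
main(void)
{
    struct fake_thread td = { .pflags = 0 };

    request_resched(&td);    /* e.g. a higher-priority thread became runnable */
    resched_ast(&td);        /* runs on the simulated userspace return */
    resched_ast(&td);        /* no-op: the flag was already consumed */
    return (0);
}

The indirection matters because setting a bit in the current thread's own
private flag word is always safe while holding another thread's lock,
whereas ast_sched_locked() on curthread would require the curthread lock.
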
---
 sys/kern/kern_synch.c |  2 +-
 sys/kern/sched_4bsd.c | 21 +++++++++++++++++++--
 sys/sys/proc.h        |  1 +
 sys/sys/sched.h       |  2 ++
 4 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 8e956324ee23..fc6c9857463c 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -600,7 +600,7 @@ loadav(void *arg)
 	    loadav, NULL, C_DIRECT_EXEC | C_PREL(32));
 }
 
-static void
+void
 ast_scheduler(struct thread *td, int tda __unused)
 {
 #ifdef KTRACE
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 317b47da2cca..a47fe3ce72c3 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -108,6 +108,8 @@ struct td_sched {
 #define TDF_BOUND	TDF_SCHED1	/* Bound to one CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
+#define	TDP_RESCHED	TDP_SCHED1	/* Reschedule due to maybe_resched(). */
+
 /* flags kept in ts_flags */
 #define	TSF_AFFINITY	0x0001		/* Has a non-"full" CPU set. */
 
@@ -274,6 +276,17 @@ sched_load_rem(void)
 	KTR_COUNTER0(KTR_SCHED, "load", "global load", sched_tdcnt);
 	SDT_PROBE2(sched, , , load__change, NOCPU, sched_tdcnt);
 }
+
+static void
+maybe_resched_ast(struct thread *td, int tda)
+{
+	MPASS(td == curthread);		/* We are AST */
+	if ((td->td_pflags & TDP_RESCHED) != 0) {
+		td->td_pflags &= ~TDP_RESCHED;
+		ast_scheduler(td, tda);
+	}
+}
+
 /*
  * Arrange to reschedule if necessary, taking the priorities and
  * schedulers into account.
@@ -281,10 +294,12 @@ sched_load_rem(void)
 static void
 maybe_resched(struct thread *td)
 {
+	struct thread *ctd;
 
+	ctd = curthread;
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
-	if (td->td_priority < curthread->td_priority)
-		ast_sched_locked(curthread, TDA_SCHED);
+	if (td->td_priority < ctd->td_priority)
+		ctd->td_pflags |= TDP_RESCHED;
 }
 
 /*
@@ -621,6 +636,8 @@ sched_4bsd_setup(void)
 
 	/* Account for thread0. */
 	sched_load_add();
+
+	ast_register(TDA_SCHED_PRIV, ASTR_UNCOND, 0, maybe_resched_ast);
 }
 
 /*
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 0c9658fff725..7c4431ab0819 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -503,6 +503,7 @@ enum {
 	TDA_SIGSUSPEND,
 	TDA_MOD3,		/* .. and after */
 	TDA_MOD4,
+	TDA_SCHED_PRIV,
 	TDA_MAX,
 };
 #define	TDAI(tda)		(1U << (tda))
diff --git a/sys/sys/sched.h b/sys/sys/sched.h
index 08d8636aaa68..c7b7b849947c 100644
--- a/sys/sys/sched.h
+++ b/sys/sys/sched.h
@@ -188,6 +188,8 @@ sched_unpin(void)
 	curthread->td_pinned--;
 }
 
+void ast_scheduler(struct thread *td, int tda);
+
 /* sched_add arguments (formerly setrunqueue) */
 #define	SRQ_BORING	0x0000		/* No special circumstances. */
 #define	SRQ_YIELDING	0x0001		/* We are yielding (from mi_switch). */
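
A note on the ASTR_UNCOND registration: maybe_resched() no longer sets any
pending bit in td_ast, so the new handler is registered unconditionally in
order to be consulted on every userspace return; a conditionally registered
handler would never fire for the private TDP flag.  The sketch below is a
simplified, hypothetical model of that dispatch rule; the real loop lives
in ast_handler() and differs in detail, and the FAKE_ASTR_UNCOND value is
illustrative only.

/*
 * Hypothetical model of conditional vs. unconditional AST dispatch.
 * Table layout, tda indices and the flag value are illustrative.
 */
#include <stdio.h>

#define TDAI(tda)        (1U << (tda))
#define FAKE_ASTR_UNCOND 0x0001       /* run regardless of pending bits */

struct fake_ast_entry {
    const char   *name;
    int          tda;                 /* index of the pending bit */
    int          flags;
};

static void
dispatch(unsigned int td_ast, const struct fake_ast_entry *tab, int n)
{
    int i;

    for (i = 0; i < n; i++) {
        /* Unconditional entries always run; the rest need their bit set. */
        if ((tab[i].flags & FAKE_ASTR_UNCOND) != 0 ||
            (td_ast & TDAI(tab[i].tda)) != 0)
            printf("run %s\n", tab[i].name);
    }
}

int
main(void)
{
    const struct fake_ast_entry tab[] = {
        { "ast_scheduler (conditional, TDA_SCHED)", 0, 0 },
        { "maybe_resched_ast (unconditional, TDA_SCHED_PRIV)", 1,
          FAKE_ASTR_UNCOND },
    };

    /* No pending bits are set: only the unconditional entry runs. */
    dispatch(0, tab, 2);
    return (0);
}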