git: 245157fd8a38 - main - ktrcsw(): should not be called when the thread is owning interlock or on sleepq

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Fri, 30 Jan 2026 18:46:03 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=245157fd8a382c3989075789ee98582665f3b31d

commit 245157fd8a382c3989075789ee98582665f3b31d
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2026-01-22 21:46:01 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2026-01-30 18:45:48 +0000

    ktrcsw(): should not be called when the thread is owning interlock or on sleepq
    
    The issue is that for ktrcsw() we lock the ktrace_mtx mutex while owning
    the interlock from a subsystem that called msleep().  In particular, the
    VM subsystem might call msleep() if page allocation failed.  This
    establishes order VM locks (e.g. domain free queue lock) -> ktrace_mtx.
    Calling free() while owning ktrace_mtx gives the reverse order.
    
    Worse, msleep_spin_sbt() call s ktrcsw() while the thread is put on
    sleep queue.  Then, since the mutex might be contested, the thread needs
    to be put on turnstil, which cannot work.
    
    Move the ktrcsw() call for switch-out after the wakeup, when the thread
    does not yet re-obtained any locks.  From there, we call a special
    version of ktrcsw(), which is passed the actual time when the context
    switch occured.
    
    The drawback is that the switch-out record is only written in the
    ktrace.out file after the switch-in occurred, but this is probably not
    too serious.
    
    Reported and tested by: pho
    Reviewed by:    markj
    Sponsored by:   The FreeBSD Foundation
    MFC after:      1 week
    Differential revision:  https://reviews.freebsd.org/D54831
---
 sys/kern/kern_ktrace.c | 18 ++++++++++++++++--
 sys/kern/kern_synch.c  | 26 +++++++++++---------------
 sys/sys/ktrace.h       |  1 +
 3 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index b58e69a3f38e..f3ee1c53fafd 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -838,8 +838,8 @@ ktrpsig(int sig, sig_t action, sigset_t *mask, int code)
 	ktrace_exit(td);
 }
 
-void
-ktrcsw(int out, int user, const char *wmesg)
+static void
+ktrcsw_impl(int out, int user, const char *wmesg, const struct timespec *tv)
 {
 	struct thread *td = curthread;
 	struct ktr_request *req;
@@ -854,6 +854,8 @@ ktrcsw(int out, int user, const char *wmesg)
 	kc = &req->ktr_data.ktr_csw;
 	kc->out = out;
 	kc->user = user;
+	if (tv != NULL)
+		req->ktr_header.ktr_time = *tv;
 	if (wmesg != NULL)
 		strlcpy(kc->wmesg, wmesg, sizeof(kc->wmesg));
 	else
@@ -862,6 +864,18 @@ ktrcsw(int out, int user, const char *wmesg)
 	ktrace_exit(td);
 }
 
+void
+ktrcsw(int out, int user, const char *wmesg)
+{
+	ktrcsw_impl(out, user, wmesg, NULL);
+}
+
+void
+ktrcsw_out(const struct timespec *tv, const char *wmesg)
+{
+	ktrcsw_impl(1, 0, wmesg, tv);
+}
+
 void
 ktrstruct(const char *name, const void *data, size_t datalen)
 {
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index fc6c9857463c..22628c78d8a5 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -133,6 +133,7 @@ _sleep(const void *ident, struct lock_object *lock, int priority,
 {
 	struct thread *td __ktrace_used;
 	struct lock_class *class;
+	struct timespec sw_out_tv __ktrace_used;
 	uintptr_t lock_state;
 	int catch, pri, rval, sleepq_flags;
 	WITNESS_SAVE_DECL(lock_witness);
@@ -141,7 +142,7 @@ _sleep(const void *ident, struct lock_object *lock, int priority,
 	td = curthread;
 #ifdef KTRACE
 	if (KTRPOINT(td, KTR_CSW))
-		ktrcsw(1, 0, wmesg);
+		nanotime(&sw_out_tv);
 #endif
 	WITNESS_WARN(WARN_GIANTOK | WARN_SLEEPOK, lock,
 	    "Sleeping on \"%s\"", wmesg);
@@ -222,8 +223,10 @@ _sleep(const void *ident, struct lock_object *lock, int priority,
 		rval = 0;
 	}
 #ifdef KTRACE
-	if (KTRPOINT(td, KTR_CSW))
+	if (KTRPOINT(td, KTR_CSW)) {
+		ktrcsw_out(&sw_out_tv, wmesg);
 		ktrcsw(0, 0, wmesg);
+	}
 #endif
 	PICKUP_GIANT();
 	if (lock != NULL && lock != &Giant.lock_object && !(priority & PDROP)) {
@@ -239,6 +242,7 @@ msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg,
     sbintime_t sbt, sbintime_t pr, int flags)
 {
 	struct thread *td __ktrace_used;
+	struct timespec sw_out_tv __ktrace_used;
 	int rval;
 	WITNESS_SAVE_DECL(mtx);
 
@@ -266,19 +270,9 @@ msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg,
 	if (sbt != 0)
 		sleepq_set_timeout_sbt(ident, sbt, pr, flags);
 
-	/*
-	 * Can't call ktrace with any spin locks held so it can lock the
-	 * ktrace_mtx lock, and WITNESS_WARN considers it an error to hold
-	 * any spin lock.  Thus, we have to drop the sleepq spin lock while
-	 * we handle those requests.  This is safe since we have placed our
-	 * thread on the sleep queue already.
-	 */
 #ifdef KTRACE
-	if (KTRPOINT(td, KTR_CSW)) {
-		sleepq_release(ident);
-		ktrcsw(1, 0, wmesg);
-		sleepq_lock(ident);
-	}
+	if (KTRPOINT(td, KTR_CSW))
+		nanotime(&sw_out_tv);
 #endif
 #ifdef WITNESS
 	sleepq_release(ident);
@@ -293,8 +287,10 @@ msleep_spin_sbt(const void *ident, struct mtx *mtx, const char *wmesg,
 		rval = 0;
 	}
 #ifdef KTRACE
-	if (KTRPOINT(td, KTR_CSW))
+	if (KTRPOINT(td, KTR_CSW)) {
+		ktrcsw_out(&sw_out_tv, wmesg);
 		ktrcsw(0, 0, wmesg);
+	}
 #endif
 	PICKUP_GIANT();
 	mtx_lock_spin(mtx);
diff --git a/sys/sys/ktrace.h b/sys/sys/ktrace.h
index 822822a6ffe7..1ba5c84000b0 100644
--- a/sys/sys/ktrace.h
+++ b/sys/sys/ktrace.h
@@ -340,6 +340,7 @@ ktr_get_tracevp(struct proc *p, bool ref)
 void	ktr_io_params_free(struct ktr_io_params *);
 void	ktrnamei(const char *);
 void	ktrcsw(int, int, const char *);
+void	ktrcsw_out(const struct timespec *, const char *);
 void	ktrpsig(int, sig_t, sigset_t *, int);
 void	ktrfault(vm_offset_t, int);
 void	ktrfaultend(int);