git: 1a849ff1e9a9 - stable/15 - jail: simplify EVFILT_JAIL events

From: Jamie Gritton <jamie_at_FreeBSD.org>
Date: Mon, 15 Sep 2025 04:04:16 UTC
The branch stable/15 has been updated by jamie:

URL: https://cgit.FreeBSD.org/src/commit/?id=1a849ff1e9a93f2d0f66c962a17b20af13796f01

commit 1a849ff1e9a93f2d0f66c962a17b20af13796f01
Author:     Jamie Gritton <jamie@FreeBSD.org>
AuthorDate: 2025-09-12 05:22:45 +0000
Commit:     Jamie Gritton <jamie@FreeBSD.org>
CommitDate: 2025-09-15 03:33:18 +0000

    jail: simplify EVFILT_JAIL events
    
    Instead of using the EVFILT_PROC model of attempting to automatically
    register new events when a child jail is created, just give a single
    event when a child jail is created.  As was already done with jail
    attach events, make a best-effort report of the added jail's id in
    kn_data.  If there are multiple NOTE_JAIL_CHILD and/or NOTE_JAIL_ATTACH
    events, set the NOTE_JAIL_MULTI flag, and don't report anything in
    data, indicating that the caller will need to query the system state
    on their own.
    
    (cherry picked from commit dbcaac13e49c88d1c077f34f56dd2b7ba77a145a)
---
 lib/libsys/kqueue.2   | 41 +++++++++++------------------
 sys/kern/kern_event.c | 71 +++++++++++----------------------------------------
 sys/kern/kern_jail.c  |  4 +--
 sys/sys/event.h       | 17 ++++++------
 4 files changed, 39 insertions(+), 94 deletions(-)

diff --git a/lib/libsys/kqueue.2 b/lib/libsys/kqueue.2
index e413f7d4fbca..aafb5317c5e0 100644
--- a/lib/libsys/kqueue.2
+++ b/lib/libsys/kqueue.2
@@ -22,7 +22,7 @@
 .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 .\" SUCH DAMAGE.
 .\"
-.Dd September 4, 2025
+.Dd September 11, 2025
 .Dt KQUEUE 2
 .Os
 .Sh NAME
@@ -614,41 +614,30 @@ The process ID will be stored in
 If more than one process has attached since the last call to
 .Fn kevent ,
 .Va data
-will contain the most recently attached process ID,
-with
-.Dv NOTE_JAIL_ATTACH_MULTI
-set in
-.Va fflags .
+will be zero.
 .It Dv NOTE_JAIL_REMOVE
 The jail has been removed.
 .It Dv NOTE_JAIL_CHILD
 A child of the watched jail has been created.
-.It Dv NOTE_TRACK
-Follow child jails created under this jail.
-Register a new kevent to monitor the child jail using the same
-.Va fflags
-as the original event.
-The child jail will signal an event with
-.Dv NOTE_CHILD
-set in
-.Va fflags
-and the parent JID in
+Its jail ID will be stored in
 .Va data .
-.Pp
-If registering a new kevent fails
-.Pq usually due to resource limitations ,
-it will signal an event with
-.Dv NOTE_TRACKERR
-set in
-.Va fflags ,
-and the child jail will not signal a
-.Dv NOTE_CHILD
-event.
+If more than one jail has been created since the last call to
+.Fn kevent ,
+.Va data
+will be zero.
 .El
 .Pp
 On return,
 .Va fflags
 contains the events which triggered the filter.
+It will also contain
+.Dv NOTE_JAIL_MULTI
+if more than one
+.Dv NOTE_JAIL_ATTACH
+or
+.Dv NOTE_JAIL_CHILD
+event has been received since the last call to
+.Fn kevent .
 .It Dv EVFILT_TIMER
 Establishes an arbitrary timer identified by
 .Va ident .
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 501adc151d44..8d1ff313735b 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -539,8 +539,7 @@ filt_proc(struct knote *kn, long hint)
  * process forked. Additionally, for each knote attached to the
  * parent, check whether user wants to track the new process. If so
  * attach a new knote to it, and immediately report an event with the
- * child's pid. This is also called on jail creation, which is treated
- * the same way by jail events.
+ * child's pid.
  */
 void
 knote_fork(struct knlist *list, int pid)
@@ -567,8 +566,6 @@ knote_fork(struct knlist *list, int pid)
 		/*
 		 * The same as knote(), activate the event.
 		 */
-		_Static_assert(NOTE_JAIL_CHILD == NOTE_FORK,
-		    "NOTE_JAIL_CHILD should be the same as NOTE_FORK");
 		if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
 			if (kn->kn_fop->f_event(kn, NOTE_FORK))
 				KNOTE_ACTIVATE(kn, 1);
@@ -632,30 +629,11 @@ int
 filt_jailattach(struct knote *kn)
 {
 	struct prison *pr;
-	bool immediate;
 
-	immediate = false;
 	if (kn->kn_id == 0) {
 		/* Let jid=0 watch the current prison (including prison0). */
 		pr = curthread->td_ucred->cr_prison;
 		mtx_lock(&pr->pr_mtx);
-	} else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) {
-		/*
-		 * The kernel registers prisons before they are valid,
-		 * so prison_find_child will fail.
-		 */
-		TAILQ_FOREACH(pr, &allprison, pr_list) {
-			if (pr->pr_id < kn->kn_id)
-				continue;
-			if (pr->pr_id > kn->kn_id) {
-				pr = NULL;
-				break;
-			}
-			mtx_lock(&pr->pr_mtx);
-			break;
-		}
-		if (pr == NULL)
-			return (ENOENT);
 	} else {
 		sx_slock(&allprison_lock);
 		pr = prison_find_child(curthread->td_ucred->cr_prison,
@@ -670,32 +648,7 @@ filt_jailattach(struct knote *kn)
 	}
 	kn->kn_ptr.p_prison = pr;
 	kn->kn_flags |= EV_CLEAR;
-
-	/*
-	 * Internal flag indicating registration done by kernel for the
-	 * purposes of getting a NOTE_CHILD notification.
-	 */
-	if (kn->kn_flags & EV_FLAG2) {
-		kn->kn_flags &= ~EV_FLAG2;
-		kn->kn_data = kn->kn_sdata;		/* parent id */
-		kn->kn_fflags = NOTE_CHILD;
-		kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK;
-		immediate = true; /* Force immediate activation of child note. */
-	}
-	/*
-	 * Internal flag indicating registration done by kernel (for other than
-	 * NOTE_CHILD).
-	 */
-	if (kn->kn_flags & EV_FLAG1) {
-		kn->kn_flags &= ~EV_FLAG1;
-	}
-
 	knlist_add(pr->pr_klist, kn, 1);
-
-	/* Immediately activate any child notes. */
-	if (immediate)
-		KNOTE_ACTIVATE(kn, 0);
-
 	mtx_unlock(&pr->pr_mtx);
 	return (0);
 }
@@ -720,18 +673,24 @@ filt_jail(struct knote *kn, long hint)
 	if (pr == NULL) /* already activated, from attach filter */
 		return (0);
 
-	/* Mask off extra data. */
-	event = (u_int)hint & NOTE_JAIL_CTRLMASK;
+	/*
+	 * Mask off extra data.  In the NOTE_JAIL_CHILD case, that's
+	 * everything except the NOTE_JAIL_CHILD bit itself, since a
+	 * JID is any positive integer.
+	 */
+	event = ((u_int)hint & NOTE_JAIL_CHILD) ? NOTE_JAIL_CHILD :
+	    (u_int)hint & NOTE_JAIL_CTRLMASK;
 
 	/* If the user is interested in this event, record it. */
 	if (kn->kn_sfflags & event)
 		kn->kn_fflags |= event;
 
-	/* Report the attached process id. */
-	if (event == NOTE_JAIL_ATTACH) {
+	/* Report the created jail id or attached process id. */
+	if (event == NOTE_JAIL_CHILD || event == NOTE_JAIL_ATTACH) {
 		if (kn->kn_data != 0)
-			kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI;
-		kn->kn_data = hint & NOTE_JAIL_DATAMASK;
+			kn->kn_fflags |= NOTE_JAIL_MULTI;
+		kn->kn_data = (kn->kn_fflags & NOTE_JAIL_MULTI) ? 0U :
+		    (u_int)hint & ~event;
 	}
 
 	/* Prison is gone, so flag the event as finished. */
@@ -1729,8 +1688,8 @@ findkn:
 		/*
 		 * If possible, find an existing knote to use for this kevent.
 		 */
-		if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL)
-		    && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+		if (kev->filter == EVFILT_PROC &&
+		    (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
 			/* This is an internal creation of a process tracking
 			 * note. Don't attempt to coalesce this with an
 			 * existing note.
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 3d18b03119ff..d90ccf4a04c8 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -2221,9 +2221,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
 	 */
 	if (created) {
 		sx_assert(&allprison_lock, SX_XLOCKED);
-		mtx_lock(&ppr->pr_mtx);
-		knote_fork(ppr->pr_klist, pr->pr_id);
-		mtx_unlock(&ppr->pr_mtx);
+		prison_knote(ppr, NOTE_JAIL_CHILD | pr->pr_id);
 		mtx_lock(&pr->pr_mtx);
 		drflags |= PD_LOCKED;
 		pr->pr_state = PRISON_STATE_ALIVE;
diff --git a/sys/sys/event.h b/sys/sys/event.h
index f161d2c938c1..91fbaa4834f7 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -205,19 +205,18 @@ struct freebsd11_kevent32 {
 #define	NOTE_PCTRLMASK	0xf0000000		/* mask for hint bits */
 #define	NOTE_PDATAMASK	0x000fffff		/* mask for pid */
 
+/* additional flags for EVFILT_PROC */
+#define	NOTE_TRACK	0x00000001		/* follow across fork/create */
+#define	NOTE_TRACKERR	0x00000002		/* could not track child */
+#define	NOTE_CHILD	0x00000004		/* am a child process */
+
 /* data/hint flags for EVFILT_JAIL */
-#define	NOTE_JAIL_SET		0x80000000	/* jail was modified */
-#define	NOTE_JAIL_CHILD		0x40000000	/* child jail was created */
+#define	NOTE_JAIL_CHILD		0x80000000	/* child jail was created */
+#define	NOTE_JAIL_SET		0x40000000	/* jail was modified */
 #define	NOTE_JAIL_ATTACH	0x20000000	/* jail was attached to */
 #define	NOTE_JAIL_REMOVE	0x10000000	/* jail was removed */
-#define NOTE_JAIL_ATTACH_MULTI	0x08000000	/* multiple procs attached */
+#define NOTE_JAIL_MULTI		0x08000000	/* multiple child or attach */
 #define	NOTE_JAIL_CTRLMASK	0xf0000000	/* mask for hint bits */
-#define	NOTE_JAIL_DATAMASK	0x000fffff	/* mask for pid */
-
-/* additional flags for EVFILT_PROC and EVFILT_JAIL */
-#define	NOTE_TRACK	0x00000001		/* follow across fork/create */
-#define	NOTE_TRACKERR	0x00000002		/* could not track child */
-#define	NOTE_CHILD	0x00000004		/* am a child process/jail */
 
 /* additional flags for EVFILT_TIMER */
 #define NOTE_SECONDS		0x00000001	/* data is seconds */