git: 1bd74d201a53 - main - jail: add kqueue(2) support for jails
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Thu, 04 Sep 2025 18:59:58 UTC
The branch main has been updated by jamie:
URL: https://cgit.FreeBSD.org/src/commit/?id=1bd74d201a534540614663686890ab96a3bbe2c7
commit 1bd74d201a534540614663686890ab96a3bbe2c7
Author: Jamie Gritton <jamie@FreeBSD.org>
AuthorDate: 2025-09-04 18:56:56 +0000
Commit: Jamie Gritton <jamie@FreeBSD.org>
CommitDate: 2025-09-04 18:56:56 +0000
jail: add kqueue(2) support for jails
Add kqueue tracking to jails, inspired by how it's done with processes.
EVFILT_JAIL takes a jail ID, and tracks with NOTE_JAIL_SET,
NOTE_JAIL_ATTACH, NOTE_JAIL_REMOVE, and NOTE_JAIL_CHILD. It also uses
the NOTE_TRACK mechanism that EVFILT_PROC uses, using the same result
flags (NOTE_CHILD and NOTE_TRACKERR).
Relnotes: yes
Differential Revision: https://reviews.freebsd.org/D51940
---
lib/libsys/kqueue.2 | 58 ++++++++++++++++++++-
sys/kern/kern_event.c | 139 ++++++++++++++++++++++++++++++++++++++++++++++++--
sys/kern/kern_jail.c | 64 ++++++++++++++++++++---
sys/sys/event.h | 19 +++++--
sys/sys/jail.h | 7 ++-
5 files changed, 270 insertions(+), 17 deletions(-)
diff --git a/lib/libsys/kqueue.2 b/lib/libsys/kqueue.2
index d6e949baa24c..e413f7d4fbca 100644
--- a/lib/libsys/kqueue.2
+++ b/lib/libsys/kqueue.2
@@ -22,7 +22,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd March 26, 2023
+.Dd September 4, 2025
.Dt KQUEUE 2
.Os
.Sh NAME
@@ -593,6 +593,62 @@ returns the number of times the signal has occurred since the last call to
This filter automatically sets the
.Dv EV_CLEAR
flag internally.
+.It Dv EVFILT_JAIL
+Takes the jail ID to monitor as the identifier and the events to watch for
+in
+.Va fflags ,
+and returns when the jail performs one or more of the requested events.
+If a process can normally see a jail, it can attach an event to it.
+An identifier of zero will watch the process's own jail.
+The events to monitor are:
+.Bl -tag -width "Dv NOTE_JAIL_ATTACH"
+.It Dv NOTE_JAIL_SET
+The jail has been changed via
+.Xr jail_set 2 .
+.It Dv NOTE_JAIL_ATTACH
+A process has attached to the jail via
+.Xr jail_attach 2
+or a similar call.
+The process ID will be stored in
+.Va data .
+If more than one process has attached since the last call to
+.Fn kevent ,
+.Va data
+will contain the most recently attached process ID,
+with
+.Dv NOTE_JAIL_ATTACH_MULTI
+set in
+.Va fflags .
+.It Dv NOTE_JAIL_REMOVE
+The jail has been removed.
+.It Dv NOTE_JAIL_CHILD
+A child of the watched jail has been created.
+.It Dv NOTE_TRACK
+Follow child jails created under this jail.
+Register a new kevent to monitor the child jail using the same
+.Va fflags
+as the original event.
+The child jail will signal an event with
+.Dv NOTE_CHILD
+set in
+.Va fflags
+and the parent JID in
+.Va data .
+.Pp
+If registering a new kevent fails
+.Pq usually due to resource limitations ,
+it will signal an event with
+.Dv NOTE_TRACKERR
+set in
+.Va fflags ,
+and the child jail will not signal a
+.Dv NOTE_CHILD
+event.
+.El
+.Pp
+On return,
+.Va fflags
+contains the events which triggered the filter.
.It Dv EVFILT_TIMER
Establishes an arbitrary timer identified by
.Va ident .
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index eb77a5064113..501adc151d44 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -50,6 +50,7 @@
#include <sys/filedesc.h>
#include <sys/filio.h>
#include <sys/fcntl.h>
+#include <sys/jail.h>
#include <sys/kthread.h>
#include <sys/selinfo.h>
#include <sys/queue.h>
@@ -163,6 +164,9 @@ static int filt_kqueue(struct knote *kn, long hint);
static int filt_procattach(struct knote *kn);
static void filt_procdetach(struct knote *kn);
static int filt_proc(struct knote *kn, long hint);
+static int filt_jailattach(struct knote *kn);
+static void filt_jaildetach(struct knote *kn);
+static int filt_jail(struct knote *kn, long hint);
static int filt_fileattach(struct knote *kn);
static void filt_timerexpire(void *knx);
static void filt_timerexpire_l(struct knote *kn, bool proc_locked);
@@ -195,6 +199,12 @@ static const struct filterops proc_filtops = {
.f_detach = filt_procdetach,
.f_event = filt_proc,
};
+static const struct filterops jail_filtops = {
+ .f_isfd = 0,
+ .f_attach = filt_jailattach,
+ .f_detach = filt_jaildetach,
+ .f_event = filt_jail,
+};
static const struct filterops timer_filtops = {
.f_isfd = 0,
.f_attach = filt_timerattach,
@@ -365,6 +375,7 @@ static struct {
[~EVFILT_USER] = { &user_filtops, 1 },
[~EVFILT_SENDFILE] = { &null_filtops },
[~EVFILT_EMPTY] = { &file_filtops, 1 },
+ [~EVFILT_JAIL] = { &jail_filtops, 1 },
};
/*
@@ -528,7 +539,8 @@ filt_proc(struct knote *kn, long hint)
* process forked. Additionally, for each knote attached to the
* parent, check whether user wants to track the new process. If so
* attach a new knote to it, and immediately report an event with the
- * child's pid.
+ * child's pid. This is also called on jail creation, which is treated
+ * the same way by jail events.
*/
void
knote_fork(struct knlist *list, int pid)
@@ -555,6 +567,8 @@ knote_fork(struct knlist *list, int pid)
/*
* The same as knote(), activate the event.
*/
+ _Static_assert(NOTE_JAIL_CHILD == NOTE_FORK,
+ "NOTE_JAIL_CHILD should be the same as NOTE_FORK");
if ((kn->kn_sfflags & NOTE_TRACK) == 0) {
if (kn->kn_fop->f_event(kn, NOTE_FORK))
KNOTE_ACTIVATE(kn, 1);
@@ -614,6 +628,124 @@ knote_fork(struct knlist *list, int pid)
}
}
+int
+filt_jailattach(struct knote *kn)
+{
+ struct prison *pr;
+ bool immediate;
+
+ immediate = false;
+ if (kn->kn_id == 0) {
+ /* Let jid=0 watch the current prison (including prison0). */
+ pr = curthread->td_ucred->cr_prison;
+ mtx_lock(&pr->pr_mtx);
+ } else if (kn->kn_flags & (EV_FLAG1 | EV_FLAG2)) {
+ /*
+ * The kernel registers prisons before they are valid,
+ * so prison_find_child will fail.
+ */
+ TAILQ_FOREACH(pr, &allprison, pr_list) {
+ if (pr->pr_id < kn->kn_id)
+ continue;
+ if (pr->pr_id > kn->kn_id) {
+ pr = NULL;
+ break;
+ }
+ mtx_lock(&pr->pr_mtx);
+ break;
+ }
+ if (pr == NULL)
+ return (ENOENT);
+ } else {
+ sx_slock(&allprison_lock);
+ pr = prison_find_child(curthread->td_ucred->cr_prison,
+ kn->kn_id);
+ sx_sunlock(&allprison_lock);
+ if (pr == NULL)
+ return (ENOENT);
+ if (!prison_isalive(pr)) {
+ mtx_unlock(&pr->pr_mtx);
+ return (ENOENT);
+ }
+ }
+ kn->kn_ptr.p_prison = pr;
+ kn->kn_flags |= EV_CLEAR;
+
+ /*
+ * Internal flag indicating registration done by kernel for the
+ * purposes of getting a NOTE_CHILD notification.
+ */
+ if (kn->kn_flags & EV_FLAG2) {
+ kn->kn_flags &= ~EV_FLAG2;
+ kn->kn_data = kn->kn_sdata; /* parent id */
+ kn->kn_fflags = NOTE_CHILD;
+ kn->kn_sfflags &= ~NOTE_JAIL_CTRLMASK;
+ immediate = true; /* Force immediate activation of child note. */
+ }
+ /*
+ * Internal flag indicating registration done by kernel (for other than
+ * NOTE_CHILD).
+ */
+ if (kn->kn_flags & EV_FLAG1) {
+ kn->kn_flags &= ~EV_FLAG1;
+ }
+
+ knlist_add(pr->pr_klist, kn, 1);
+
+ /* Immediately activate any child notes. */
+ if (immediate)
+ KNOTE_ACTIVATE(kn, 0);
+
+ mtx_unlock(&pr->pr_mtx);
+ return (0);
+}
+
+void
+filt_jaildetach(struct knote *kn)
+{
+ if (kn->kn_ptr.p_prison != NULL) {
+ knlist_remove(kn->kn_knlist, kn, 0);
+ kn->kn_ptr.p_prison = NULL;
+ } else
+ kn->kn_status |= KN_DETACHED;
+}
+
+int
+filt_jail(struct knote *kn, long hint)
+{
+ struct prison *pr;
+ u_int event;
+
+ pr = kn->kn_ptr.p_prison;
+ if (pr == NULL) /* already activated, from attach filter */
+ return (0);
+
+ /* Mask off extra data. */
+ event = (u_int)hint & NOTE_JAIL_CTRLMASK;
+
+ /* If the user is interested in this event, record it. */
+ if (kn->kn_sfflags & event)
+ kn->kn_fflags |= event;
+
+ /* Report the attached process id. */
+ if (event == NOTE_JAIL_ATTACH) {
+ if (kn->kn_data != 0)
+ kn->kn_fflags |= NOTE_JAIL_ATTACH_MULTI;
+ kn->kn_data = hint & NOTE_JAIL_DATAMASK;
+ }
+
+ /* Prison is gone, so flag the event as finished. */
+ if (event == NOTE_JAIL_REMOVE) {
+ kn->kn_flags |= EV_EOF | EV_ONESHOT;
+ kn->kn_ptr.p_prison = NULL;
+ if (kn->kn_fflags == 0)
+ kn->kn_flags |= EV_DROP;
+ return (1);
+ }
+
+ return (kn->kn_fflags != 0);
+}
+
/*
* XXX: EVFILT_TIMER should perhaps live in kern_time.c beside the
* interval timer support code.
@@ -1597,8 +1729,8 @@ findkn:
/*
* If possible, find an existing knote to use for this kevent.
*/
- if (kev->filter == EVFILT_PROC &&
- (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
+ if ((kev->filter == EVFILT_PROC || kev->filter == EVFILT_JAIL)
+ && (kev->flags & (EV_FLAG1 | EV_FLAG2)) != 0) {
/* This is an internal creation of a process tracking
* note. Don't attempt to coalesce this with an
* existing note.
@@ -2800,6 +2932,7 @@ knote_init(void)
knote_zone = uma_zcreate("KNOTE", sizeof(struct knote), NULL, NULL,
NULL, NULL, UMA_ALIGN_PTR, 0);
ast_register(TDA_KQUEUE, ASTR_ASTF_REQUIRED, 0, ast_kqueue);
+ prison0.pr_klist = knlist_alloc(&prison0.pr_mtx);
}
SYSINIT(knote, SI_SUB_PSEUDO, SI_ORDER_ANY, knote_init, NULL);
diff --git a/sys/kern/kern_jail.c b/sys/kern/kern_jail.c
index 7c9a15ae18f3..52210553016b 100644
--- a/sys/kern/kern_jail.c
+++ b/sys/kern/kern_jail.c
@@ -45,6 +45,7 @@
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/epoch.h>
+#include <sys/event.h>
#include <sys/taskqueue.h>
#include <sys/fcntl.h>
#include <sys/jail.h>
@@ -154,7 +155,8 @@ static void prison_complete(void *context, int pending);
static void prison_deref(struct prison *pr, int flags);
static void prison_deref_kill(struct prison *pr, struct prisonlist *freeprison);
static int prison_lock_xlock(struct prison *pr, int flags);
-static void prison_cleanup(struct prison *pr);
+static void prison_cleanup_locked(struct prison *pr);
+static void prison_cleanup_unlocked(struct prison *pr);
static void prison_free_not_last(struct prison *pr);
static void prison_proc_free_not_last(struct prison *pr);
static void prison_proc_relink(struct prison *opr, struct prison *npr,
@@ -167,6 +169,7 @@ static void prison_racct_attach(struct prison *pr);
static void prison_racct_modify(struct prison *pr);
static void prison_racct_detach(struct prison *pr);
#endif
+static void prison_knote(struct prison *pr, long hint);
/* Flags for prison_deref */
#define PD_DEREF 0x01 /* Decrement pr_ref */
@@ -1018,6 +1021,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
int ip6s;
bool redo_ip6;
#endif
+ bool maybe_changed;
uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
uint64_t pr_allow_diff;
unsigned tallow;
@@ -1422,6 +1426,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
pr = NULL;
inspr = NULL;
deadpr = NULL;
+ maybe_changed = false;
if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
namelc = strrchr(name, '.');
jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
@@ -1643,6 +1648,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
tpr->pr_childcount++;
+ pr->pr_klist = knlist_alloc(&pr->pr_mtx);
/* Set some default values, and inherit some from the parent. */
if (namelc == NULL)
@@ -1880,6 +1886,7 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
goto done_deref;
}
}
+ maybe_changed = true;
/* Set the parameters of the prison. */
#ifdef INET
@@ -2112,7 +2119,12 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
* reference via persistence, or is about to gain one via attachment.
*/
if (created) {
- drflags = prison_lock_xlock(pr, drflags);
+ sx_assert(&allprison_lock, SX_XLOCKED);
+ mtx_lock(&ppr->pr_mtx);
+ knote_fork(ppr->pr_klist, pr->pr_id);
+ mtx_unlock(&ppr->pr_mtx);
+ mtx_lock(&pr->pr_mtx);
+ drflags |= PD_LOCKED;
pr->pr_state = PRISON_STATE_ALIVE;
}
@@ -2150,6 +2162,13 @@ kern_jail_set(struct thread *td, struct uio *optuio, int flags)
td->td_retval[0] = pr->pr_id;
done_deref:
+ /*
+ * Report changes to kevent. This can happen even if the
+ * system call fails, as changes might have been made before
+ * the failure.
+ */
+ if (maybe_changed && !created)
+ prison_knote(pr, NOTE_JAIL_SET);
/* Release any temporary prison holds and/or locks. */
if (pr != NULL)
prison_deref(pr, drflags);
@@ -2755,6 +2774,7 @@ do_jail_attach(struct thread *td, struct prison *pr, int drflags)
prison_proc_relink(oldcred->cr_prison, pr, p);
prison_deref(oldcred->cr_prison, drflags);
crfree(oldcred);
+ prison_knote(pr, NOTE_JAIL_ATTACH | td->td_proc->p_pid);
/*
* If the prison was killed while changing credentials, die along
@@ -3182,9 +3202,10 @@ prison_deref(struct prison *pr, int flags)
refcount_load(&prison0.pr_uref) > 0,
("prison0 pr_uref=0"));
pr->pr_state = PRISON_STATE_DYING;
+ prison_cleanup_locked(pr);
mtx_unlock(&pr->pr_mtx);
flags &= ~PD_LOCKED;
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
}
}
}
@@ -3327,8 +3348,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
}
if (!(cpr->pr_flags & PR_REMOVE))
continue;
- prison_cleanup(cpr);
+ prison_cleanup_unlocked(cpr);
mtx_lock(&cpr->pr_mtx);
+ prison_cleanup_locked(cpr);
cpr->pr_flags &= ~PR_REMOVE;
if (cpr->pr_flags & PR_PERSIST) {
cpr->pr_flags &= ~PR_PERSIST;
@@ -3363,8 +3385,9 @@ prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
if (rpr != NULL)
LIST_REMOVE(rpr, pr_sibling);
- prison_cleanup(pr);
+ prison_cleanup_unlocked(pr);
mtx_lock(&pr->pr_mtx);
+ prison_cleanup_locked(pr);
if (pr->pr_flags & PR_PERSIST) {
pr->pr_flags &= ~PR_PERSIST;
prison_proc_free_not_last(pr);
@@ -3411,10 +3434,21 @@ prison_lock_xlock(struct prison *pr, int flags)
/*
* Release a prison's resources when it starts dying (when the last user
- * reference is dropped, or when it is killed).
+ * reference is dropped, or when it is killed). Two functions are called,
+ * for work that requires a locked prison or an unlocked one.
*/
static void
-prison_cleanup(struct prison *pr)
+prison_cleanup_locked(struct prison *pr)
+{
+ sx_assert(&allprison_lock, SA_XLOCKED);
+ mtx_assert(&pr->pr_mtx, MA_OWNED);
+ prison_knote(pr, NOTE_JAIL_REMOVE);
+ knlist_detach(pr->pr_klist);
+ pr->pr_klist = NULL;
+}
+
+static void
+prison_cleanup_unlocked(struct prison *pr)
{
sx_assert(&allprison_lock, SA_XLOCKED);
mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
@@ -5039,6 +5073,22 @@ prison_racct_detach(struct prison *pr)
}
#endif /* RACCT */
+/*
+ * Submit a knote for a prison, locking if necessary.
+ */
+static void
+prison_knote(struct prison *pr, long hint)
+{
+ int locked;
+
+ locked = mtx_owned(&pr->pr_mtx);
+ if (!locked)
+ mtx_lock(&pr->pr_mtx);
+ KNOTE_LOCKED(pr->pr_klist, hint);
+ if (!locked)
+ mtx_unlock(&pr->pr_mtx);
+}
+
#ifdef DDB
static void
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 1b30e4292de8..f161d2c938c1 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -45,7 +45,8 @@
#define EVFILT_USER (-11) /* User events */
#define EVFILT_SENDFILE (-12) /* attached to sendfile requests */
#define EVFILT_EMPTY (-13) /* empty send socket buf */
-#define EVFILT_SYSCOUNT 13
+#define EVFILT_JAIL (-14) /* attached to struct prison */
+#define EVFILT_SYSCOUNT 14
#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
#define EV_SET(kevp_, a, b, c, d, e, f) do { \
@@ -204,10 +205,19 @@ struct freebsd11_kevent32 {
#define NOTE_PCTRLMASK 0xf0000000 /* mask for hint bits */
#define NOTE_PDATAMASK 0x000fffff /* mask for pid */
-/* additional flags for EVFILT_PROC */
-#define NOTE_TRACK 0x00000001 /* follow across forks */
+/* data/hint flags for EVFILT_JAIL */
+#define NOTE_JAIL_SET 0x80000000 /* jail was modified */
+#define NOTE_JAIL_CHILD 0x40000000 /* child jail was created */
+#define NOTE_JAIL_ATTACH 0x20000000 /* jail was attached to */
+#define NOTE_JAIL_REMOVE 0x10000000 /* jail was removed */
+#define NOTE_JAIL_ATTACH_MULTI 0x08000000 /* multiple procs attached */
+#define NOTE_JAIL_CTRLMASK 0xf0000000 /* mask for hint bits */
+#define NOTE_JAIL_DATAMASK 0x000fffff /* mask for pid */
+
+/* additional flags for EVFILT_PROC and EVFILT_JAIL */
+#define NOTE_TRACK 0x00000001 /* follow across fork/create */
#define NOTE_TRACKERR 0x00000002 /* could not track child */
-#define NOTE_CHILD 0x00000004 /* am a child process */
+#define NOTE_CHILD 0x00000004 /* am a child process/jail */
/* additional flags for EVFILT_TIMER */
#define NOTE_SECONDS 0x00000001 /* data is seconds */
@@ -309,6 +319,7 @@ struct knote {
struct proc *p_proc; /* proc pointer */
struct kaiocb *p_aio; /* AIO job pointer */
struct aioliojob *p_lio; /* LIO job pointer */
+ struct prison *p_prison; /* prison pointer */
void *p_v; /* generic other pointer */
} kn_ptr;
const struct filterops *kn_fop;
diff --git a/sys/sys/jail.h b/sys/sys/jail.h
index d2655c52e832..4512e43a2866 100644
--- a/sys/sys/jail.h
+++ b/sys/sys/jail.h
@@ -144,6 +144,7 @@ MALLOC_DECLARE(M_PRISON);
#define JAIL_META_PRIVATE "meta"
#define JAIL_META_SHARED "env"
+struct knlist;
struct racct;
struct prison_racct;
@@ -189,7 +190,8 @@ struct prison {
struct vnode *pr_root; /* (c) vnode to rdir */
struct prison_ip *pr_addrs[PR_FAMILY_MAX]; /* (p,n) IPs of jail */
struct prison_racct *pr_prison_racct; /* (c) racct jail proxy */
- void *pr_sparep[3];
+ struct knlist *pr_klist; /* (m) attached knotes */
+ void *pr_sparep[2];
int pr_childcount; /* (a) number of child jails */
int pr_childmax; /* (p) maximum child jails */
unsigned pr_allow; /* (p) PR_ALLOW_* flags */
@@ -425,10 +427,11 @@ SYSCTL_DECL(_security_jail_param);
/*
* Kernel support functions for jail().
*/
-struct ucred;
+struct knote;
struct mount;
struct sockaddr;
struct statfs;
+struct ucred;
struct vfsconf;
/*