git: b11289f87123 - main - kqueuex(2): add KQUEUE_CPONFORK

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Sat, 18 Oct 2025 05:14:14 UTC
The branch main has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=b11289f87123f8ae06fc70bc70d26a25d4356a65

commit b11289f87123f8ae06fc70bc70d26a25d4356a65
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2025-08-19 04:34:04 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2025-10-18 05:12:36 +0000

    kqueuex(2): add KQUEUE_CPONFORK
    
    The created kqueue is copied on fork, together with the registered
    events. This means that a new kqueue is created at the same fd index
    as the parent's kqueue, and all registered events are copied into the
    new kqueue (when possible). The current active events list is also
    duplicated.
    
    Reviewed by:    markj
    Tested by:      pho
    Sponsored by:   The FreeBSD Foundation
    MFC after:      2 weeks
    Differential revision:  https://reviews.freebsd.org/D52045
---
 sys/compat/linux/linux_event.c |   2 +-
 sys/kern/kern_event.c          | 174 ++++++++++++++++++++++++++++++++++++++---
 sys/sys/event.h                |   2 +
 sys/sys/eventvar.h             |   2 +
 sys/sys/syscallsubr.h          |   3 +-
 5 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index e88791659f1f..fc3ef7c3e90a 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -104,7 +104,7 @@ static int
 epoll_create_common(struct thread *td, int flags)
 {
 
-	return (kern_kqueue(td, flags, NULL));
+	return (kern_kqueue(td, flags, false, NULL));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 80c289f7d802..1f3030995ec6 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -134,6 +134,7 @@ static fo_kqfilter_t	kqueue_kqfilter;
 static fo_stat_t	kqueue_stat;
 static fo_close_t	kqueue_close;
 static fo_fill_kinfo_t	kqueue_fill_kinfo;
+static fo_fork_t	kqueue_fork;
 
 static const struct fileops kqueueops = {
 	.fo_read = invfo_rdwr,
@@ -148,7 +149,9 @@ static const struct fileops kqueueops = {
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_cmp = file_kcmp_generic,
+	.fo_fork = kqueue_fork,
 	.fo_fill_kinfo = kqueue_fill_kinfo,
+	.fo_flags = DFLAG_FORK,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
@@ -1151,7 +1154,7 @@ int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
-	return (kern_kqueue(td, 0, NULL));
+	return (kern_kqueue(td, 0, false, NULL));
 }
 
 int
@@ -1159,27 +1162,30 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
 {
 	int flags;
 
-	if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
+	if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0)
 		return (EINVAL);
 	flags = 0;
 	if ((uap->flags & KQUEUE_CLOEXEC) != 0)
 		flags |= O_CLOEXEC;
-	return (kern_kqueue(td, flags, NULL));
+	return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0,
+	    NULL));
 }
 
 static void
-kqueue_init(struct kqueue *kq)
+kqueue_init(struct kqueue *kq, bool cponfork)
 {
 
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+	if (cponfork)
+		kq->kq_state |= KQ_CPONFORK;
 }
 
 static int
 kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
-    struct file **fpp, int flags, struct filecaps *fcaps,
+    struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork,
     struct kqueue **kqp)
 {
 	struct ucred *cred;
@@ -1191,7 +1197,7 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
 		return (ENOMEM);
 
 	error = fdip != NULL ? falloc_caps(td, fpp, fdip, flags, fcaps) :
-	    _falloc_noinstall(td, fpp, 2);
+	    _falloc_noinstall(td, fpp, 1);
 	if (error != 0) {
 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
 		return (error);
@@ -1199,31 +1205,33 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
 
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
-	kqueue_init(kq);
+	kqueue_init(kq, cponfork);
 	kq->kq_fdp = fdp;
 	kq->kq_cred = crhold(cred);
 
-	FILEDESC_XLOCK(fdp);
+	if (fdip != NULL)
+		FILEDESC_XLOCK(fdp);
 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-	FILEDESC_XUNLOCK(fdp);
+	if (fdip != NULL)
+		FILEDESC_XUNLOCK(fdp);
 
+	finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	*kqp = kq;
 	return (0);
 }
 
 int
-kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
+kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps)
 {
 	struct kqueue *kq;
 	struct file *fp;
 	int fd, error;
 
 	error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags,
-	    fcaps, &kq);
+	    fcaps, cponfork, &kq);
 	if (error != 0)
 		return (error);
 
-	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
@@ -1504,7 +1512,7 @@ kern_kevent_anonymous(struct thread *td, int nevents,
 	struct kqueue kq = {};
 	int error;
 
-	kqueue_init(&kq);
+	kqueue_init(&kq, false);
 	kq.kq_refcnt = 1;
 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
 	kqueue_drain(&kq, td);
@@ -2958,6 +2966,146 @@ noacquire:
 	return (error);
 }
 
+static int
+kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct thread *td)
+{
+	struct kqueue *kq, *kq1;
+	int error;
+
+	MPASS(fp->f_type == DTYPE_KQUEUE);
+	kq = fp->f_data;
+	if ((kq->kq_state & KQ_CPONFORK) == 0)
+		return (EOPNOTSUPP);
+	error = kqueue_acquire_ref(kq);
+	if (error != 0)
+		return (error);
+	error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1);
+	if (error == 0) {
+		kq1->kq_forksrc = kq;
+		(*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC |
+		    O_CLOEXEC | O_CLOFORK);
+	} else {
+		kqueue_release(kq, 0);
+	}
+	return (error);
+}
+
+static void
+kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1,
+    struct filedesc *fdp)
+{
+	struct knote *kn1;
+	const struct filterops *fop;
+	int error;
+
+	fop = kn->kn_fop;
+	if (fop->f_copy == NULL || (fop->f_isfd &&
+	    fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL))
+		return;
+	error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK);
+	if (error != 0)
+		return;
+
+	kn1 = knote_alloc(M_WAITOK);
+	*kn1 = *kn;
+	kn1->kn_status |= KN_DETACHED;
+	kn1->kn_status &= ~KN_QUEUED;
+	kn1->kn_kq = kq1;
+	error = fop->f_copy(kn1, p1);
+	if (error != 0) {
+		knote_free(kn1);
+		return;
+	}
+	(void)kqueue_fo_find(kn->kn_kevent.filter);
+	if (fop->f_isfd && !fhold(kn1->kn_fp)) {
+		fop->f_detach(kn1);
+		kqueue_fo_release(kn->kn_kevent.filter);
+		knote_free(kn1);
+		return;
+	}
+	if (kn->kn_knlist != NULL)
+		knlist_add(kn->kn_knlist, kn1, 0);
+	KQ_LOCK(kq1);
+	knote_attach(kn1, kq1);
+	kn1->kn_influx = 0;
+	if ((kn->kn_status & KN_QUEUED) != 0)
+		knote_enqueue(kn1);
+	KQ_UNLOCK(kq1);
+}
+
+static void
+kqueue_fork_copy_list(struct klist *knlist, struct knote *marker,
+    struct kqueue *kq, struct kqueue *kq1, struct proc *p1,
+    struct filedesc *fdp)
+{
+	struct knote *kn;
+
+	KQ_OWNED(kq);
+	kn = SLIST_FIRST(knlist);
+	while (kn != NULL) {
+		if ((kn->kn_status & KN_DETACHED) != 0 ||
+		    (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) {
+			kn = SLIST_NEXT(kn, kn_link);
+			continue;
+		}
+		kn_enter_flux(kn);
+		SLIST_INSERT_AFTER(kn, marker, kn_link);
+		KQ_UNLOCK(kq);
+		kqueue_fork_copy_knote(kq1, kn, p1, fdp);
+		KQ_LOCK(kq);
+		kn_leave_flux(kn);
+		kn = SLIST_NEXT(marker, kn_link);
+		/* XXXKIB switch kn_link to LIST? */
+		SLIST_REMOVE(knlist, marker, knote, kn_link);
+	}
+}
+
+static int
+kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1,
+    struct proc *p1, struct thread *td)
+{
+	struct kqueue *kq, *kq1;
+	struct knote *marker;
+	int error, i;
+
+	error = 0;
+	MPASS(fp == NULL);
+	MPASS(fp1->f_type == DTYPE_KQUEUE);
+
+	kq1 = fp1->f_data;
+	kq = kq1->kq_forksrc;
+	marker = knote_alloc(M_WAITOK);
+	marker->kn_status = KN_MARKER;
+
+	KQ_LOCK(kq);
+	for (i = 0; i < kq->kq_knlistsize; i++) {
+		kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1,
+		    p1, fdp);
+	}
+	if (kq->kq_knhashmask != 0) {
+		for (i = 0; i <= kq->kq_knhashmask; i++) {
+			kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq,
+			    kq1, p1, fdp);
+		}
+	}
+	kqueue_release(kq, 1);
+	kq1->kq_forksrc = NULL;
+	KQ_UNLOCK(kq);
+
+	knote_free(marker);
+	return (error);
+}
+
+static int
+kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct proc *p1, struct thread *td)
+{
+	if (*fp1 == NULL)
+		return (kqueue_fork_alloc(fdp, fp, fp1, td));
+	return (kqueue_fork_copy(fdp, fp, *fp1, p1, td));
+}
+
 struct knote_status_export_bit {
 	int kn_status_bit;
 	int knt_status_bit;
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 084eaafcbdc0..6e71445f03b0 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -228,6 +228,7 @@ struct freebsd11_kevent32 {
 
 /* Flags for kqueuex(2) */
 #define	KQUEUE_CLOEXEC	0x00000001	/* close on exec */
+#define	KQUEUE_CPONFORK	0x00000002	/* copy on fork */
 
 struct knote;
 SLIST_HEAD(klist, knote);
@@ -283,6 +284,7 @@ struct filterops {
 	void	(*f_touch)(struct knote *kn, struct kevent *kev, u_long type);
 	int	(*f_userdump)(struct proc *p, struct knote *kn,
 		    struct kinfo_knote *kin);
+	int	(*f_copy)(struct knote *kn, struct proc *p1);
 };
 
 /*
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index 7fec444447f9..7cb3269f1fdf 100644
--- a/sys/sys/eventvar.h
+++ b/sys/sys/eventvar.h
@@ -55,12 +55,14 @@ struct kqueue {
 #define KQ_CLOSING	0x10
 #define	KQ_TASKSCHED	0x20			/* task scheduled */
 #define	KQ_TASKDRAIN	0x40			/* waiting for task to drain */
+#define	KQ_CPONFORK	0x80
 	int		kq_knlistsize;		/* size of knlist */
 	struct		klist *kq_knlist;	/* list of knotes */
 	u_long		kq_knhashmask;		/* size of knhash */
 	struct		klist *kq_knhash;	/* hash table for knotes */
 	struct		task kq_task;
 	struct		ucred *kq_cred;
+	struct		kqueue *kq_forksrc;
 };
 
 #endif /* !_SYS_EVENTVAR_H_ */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 8237165b84ce..d32690634059 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -211,7 +211,8 @@ int	kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
 	    int nevents, struct kevent_copyops *k_ops,
 	    const struct timespec *timeout);
 int	kern_kill(struct thread *td, pid_t pid, int signum);
-int	kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
+int	kern_kqueue(struct thread *td, int flags, bool cponfork,
+	    struct filecaps *fcaps);
 int	kern_kldload(struct thread *td, const char *file, int *fileid);
 int	kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
 int	kern_kldunload(struct thread *td, int fileid, int flags);