git: 2002e0712320 - stable/15 - kqueuex(2): add KQUEUE_CPONFORK

From: Konstantin Belousov <kib_at_FreeBSD.org>
Date: Sat, 01 Nov 2025 00:45:07 UTC
The branch stable/15 has been updated by kib:

URL: https://cgit.FreeBSD.org/src/commit/?id=2002e07123202a0b86f7220a78786eec71c61070

commit 2002e07123202a0b86f7220a78786eec71c61070
Author:     Konstantin Belousov <kib@FreeBSD.org>
AuthorDate: 2025-08-19 04:34:04 +0000
Commit:     Konstantin Belousov <kib@FreeBSD.org>
CommitDate: 2025-11-01 00:43:14 +0000

    kqueuex(2): add KQUEUE_CPONFORK
    
    (cherry picked from commit b11289f87123f8ae06fc70bc70d26a25d4356a65)
---
 sys/compat/linux/linux_event.c |   2 +-
 sys/kern/kern_event.c          | 174 ++++++++++++++++++++++++++++++++++++++---
 sys/sys/event.h                |   2 +
 sys/sys/eventvar.h             |   2 +
 sys/sys/syscallsubr.h          |   3 +-
 5 files changed, 168 insertions(+), 15 deletions(-)

diff --git a/sys/compat/linux/linux_event.c b/sys/compat/linux/linux_event.c
index e88791659f1f..fc3ef7c3e90a 100644
--- a/sys/compat/linux/linux_event.c
+++ b/sys/compat/linux/linux_event.c
@@ -104,7 +104,7 @@ static int
 epoll_create_common(struct thread *td, int flags)
 {
 
-	return (kern_kqueue(td, flags, NULL));
+	return (kern_kqueue(td, flags, false, NULL));
 }
 
 #ifdef LINUX_LEGACY_SYSCALLS
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index 59d3e39aef93..7a6c395df86f 100644
--- a/sys/kern/kern_event.c
+++ b/sys/kern/kern_event.c
@@ -134,6 +134,7 @@ static fo_kqfilter_t	kqueue_kqfilter;
 static fo_stat_t	kqueue_stat;
 static fo_close_t	kqueue_close;
 static fo_fill_kinfo_t	kqueue_fill_kinfo;
+static fo_fork_t	kqueue_fork;
 
 static const struct fileops kqueueops = {
 	.fo_read = invfo_rdwr,
@@ -148,7 +149,9 @@ static const struct fileops kqueueops = {
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
 	.fo_cmp = file_kcmp_generic,
+	.fo_fork = kqueue_fork,
 	.fo_fill_kinfo = kqueue_fill_kinfo,
+	.fo_flags = DFLAG_FORK,
 };
 
 static int 	knote_attach(struct knote *kn, struct kqueue *kq);
@@ -1151,7 +1154,7 @@ int
 sys_kqueue(struct thread *td, struct kqueue_args *uap)
 {
 
-	return (kern_kqueue(td, 0, NULL));
+	return (kern_kqueue(td, 0, false, NULL));
 }
 
 int
@@ -1159,27 +1162,30 @@ sys_kqueuex(struct thread *td, struct kqueuex_args *uap)
 {
 	int flags;
 
-	if ((uap->flags & ~(KQUEUE_CLOEXEC)) != 0)
+	if ((uap->flags & ~(KQUEUE_CLOEXEC | KQUEUE_CPONFORK)) != 0)
 		return (EINVAL);
 	flags = 0;
 	if ((uap->flags & KQUEUE_CLOEXEC) != 0)
 		flags |= O_CLOEXEC;
-	return (kern_kqueue(td, flags, NULL));
+	return (kern_kqueue(td, flags, (uap->flags & KQUEUE_CPONFORK) != 0,
+	    NULL));
 }
 
 static void
-kqueue_init(struct kqueue *kq)
+kqueue_init(struct kqueue *kq, bool cponfork)
 {
 
 	mtx_init(&kq->kq_lock, "kqueue", NULL, MTX_DEF | MTX_DUPOK);
 	TAILQ_INIT(&kq->kq_head);
 	knlist_init_mtx(&kq->kq_sel.si_note, &kq->kq_lock);
 	TASK_INIT(&kq->kq_task, 0, kqueue_task, kq);
+	if (cponfork)
+		kq->kq_state |= KQ_CPONFORK;
 }
 
 static int
 kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
-    struct file **fpp, int flags, struct filecaps *fcaps,
+    struct file **fpp, int flags, struct filecaps *fcaps, bool cponfork,
     struct kqueue **kqp)
 {
 	struct ucred *cred;
@@ -1191,7 +1197,7 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
 		return (ENOMEM);
 
 	error = fdip != NULL ? falloc_caps(td, fpp, fdip, flags, fcaps) :
-	    _falloc_noinstall(td, fpp, 2);
+	    _falloc_noinstall(td, fpp, 1);
 	if (error != 0) {
 		chgkqcnt(cred->cr_ruidinfo, -1, 0);
 		return (error);
@@ -1199,31 +1205,33 @@ kern_kqueue_alloc(struct thread *td, struct filedesc *fdp, int *fdip,
 
 	/* An extra reference on `fp' has been held for us by falloc(). */
 	kq = malloc(sizeof(*kq), M_KQUEUE, M_WAITOK | M_ZERO);
-	kqueue_init(kq);
+	kqueue_init(kq, cponfork);
 	kq->kq_fdp = fdp;
 	kq->kq_cred = crhold(cred);
 
-	FILEDESC_XLOCK(fdp);
+	if (fdip != NULL)
+		FILEDESC_XLOCK(fdp);
 	TAILQ_INSERT_HEAD(&fdp->fd_kqlist, kq, kq_list);
-	FILEDESC_XUNLOCK(fdp);
+	if (fdip != NULL)
+		FILEDESC_XUNLOCK(fdp);
 
+	finit(*fpp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	*kqp = kq;
 	return (0);
 }
 
 int
-kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps)
+kern_kqueue(struct thread *td, int flags, bool cponfork, struct filecaps *fcaps)
 {
 	struct kqueue *kq;
 	struct file *fp;
 	int fd, error;
 
 	error = kern_kqueue_alloc(td, td->td_proc->p_fd, &fd, &fp, flags,
-	    fcaps, &kq);
+	    fcaps, cponfork, &kq);
 	if (error != 0)
 		return (error);
 
-	finit(fp, FREAD | FWRITE, DTYPE_KQUEUE, kq, &kqueueops);
 	fdrop(fp, td);
 
 	td->td_retval[0] = fd;
@@ -1504,7 +1512,7 @@ kern_kevent_anonymous(struct thread *td, int nevents,
 	struct kqueue kq = {};
 	int error;
 
-	kqueue_init(&kq);
+	kqueue_init(&kq, false);
 	kq.kq_refcnt = 1;
 	error = kqueue_kevent(&kq, td, nevents, nevents, k_ops, NULL);
 	kqueue_drain(&kq, td);
@@ -2958,6 +2966,146 @@ noacquire:
 	return (error);
 }
 
+static int	/* reached from kqueue_fork() when *fp1 is NULL; 0 or errno */
+kqueue_fork_alloc(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct thread *td)
+{
+	struct kqueue *kq, *kq1;	/* kq: source of fp; kq1: new kqueue */
+	int error;
+
+	MPASS(fp->f_type == DTYPE_KQUEUE);
+	kq = fp->f_data;
+	if ((kq->kq_state & KQ_CPONFORK) == 0)	/* KQ_CPONFORK not set */
+		return (EOPNOTSUPP);
+	error = kqueue_acquire_ref(kq);	/* released in the else branch below */
+	if (error != 0)
+		return (error);
+	error = kern_kqueue_alloc(td, fdp, NULL, fp1, 0, NULL, true, &kq1);
+	if (error == 0) {
+		kq1->kq_forksrc = kq;	/* read back by kqueue_fork_copy() */
+		(*fp1)->f_flag = fp->f_flag & (FREAD | FWRITE | FEXEC |
+		    O_CLOEXEC | O_CLOFORK);	/* keep only these bits of fp */
+	} else {
+		kqueue_release(kq, 0);	/* undo kqueue_acquire_ref() above */
+	}
+	return (error);
+}
+
+static void	/* copy kn into kq1; a skipped knote is not reported */
+kqueue_fork_copy_knote(struct kqueue *kq1, struct knote *kn, struct proc *p1,
+    struct filedesc *fdp)
+{
+	struct knote *kn1;	/* the copy that is attached to kq1 */
+	const struct filterops *fop;
+	int error;
+
+	fop = kn->kn_fop;	/* NOTE(review): ident index below is unchecked */
+	if (fop->f_copy == NULL || (fop->f_isfd &&
+	    fdp->fd_files->fdt_ofiles[kn->kn_kevent.ident].fde_file == NULL))
+		return;	/* filter has no f_copy, or the fd slot in fdp is empty */
+	error = kqueue_expand(kq1, fop, kn->kn_kevent.ident, M_WAITOK);
+	if (error != 0)
+		return;
+
+	kn1 = knote_alloc(M_WAITOK);
+	*kn1 = *kn;	/* start from a structure copy of the source knote */
+	kn1->kn_status |= KN_DETACHED;
+	kn1->kn_status &= ~KN_QUEUED;	/* re-queued below if kn was queued */
+	kn1->kn_kq = kq1;
+	error = fop->f_copy(kn1, p1);	/* per-filter hook; may refuse the copy */
+	if (error != 0) {
+		knote_free(kn1);
+		return;
+	}
+	(void)kqueue_fo_find(kn->kn_kevent.filter);	/* result is ignored */
+	if (fop->f_isfd && !fhold(kn1->kn_fp)) {	/* no file ref: undo */
+		fop->f_detach(kn1);
+		kqueue_fo_release(kn->kn_kevent.filter);
+		knote_free(kn1);
+		return;
+	}
+	if (kn->kn_knlist != NULL)
+		knlist_add(kn->kn_knlist, kn1, 0);	/* same knlist as kn */
+	KQ_LOCK(kq1);
+	knote_attach(kn1, kq1);
+	kn1->kn_influx = 0;	/* do not keep the value copied from kn */
+	if ((kn->kn_status & KN_QUEUED) != 0)	/* mirror kn's queued state */
+		knote_enqueue(kn1);
+	KQ_UNLOCK(kq1);
+}
+
+static void	/* copy each eligible knote on one list of kq into kq1 */
+kqueue_fork_copy_list(struct klist *knlist, struct knote *marker,
+    struct kqueue *kq, struct kqueue *kq1, struct proc *p1,
+    struct filedesc *fdp)
+{
+	struct knote *kn;
+
+	KQ_OWNED(kq);	/* presumably asserts the kq lock is held; confirm */
+	kn = SLIST_FIRST(knlist);
+	while (kn != NULL) {
+		if ((kn->kn_status & KN_DETACHED) != 0 ||
+		    (kn_in_flux(kn) && (kn->kn_status & KN_SCAN) == 0)) {
+			kn = SLIST_NEXT(kn, kn_link);	/* skip this knote */
+			continue;
+		}
+		kn_enter_flux(kn);
+		SLIST_INSERT_AFTER(kn, marker, kn_link);	/* keep our place */
+		KQ_UNLOCK(kq);	/* the copy runs without the kq lock */
+		kqueue_fork_copy_knote(kq1, kn, p1, fdp);
+		KQ_LOCK(kq);
+		kn_leave_flux(kn);
+		kn = SLIST_NEXT(marker, kn_link);	/* resume after marker */
+		/* XXXKIB switch kn_link to LIST? */
+		SLIST_REMOVE(knlist, marker, knote, kn_link);
+	}
+}
+
+static int	/* reached from kqueue_fork() when *fp1 is set; returns 0 */
+kqueue_fork_copy(struct filedesc *fdp, struct file *fp, struct file *fp1,
+    struct proc *p1, struct thread *td)
+{
+	struct kqueue *kq, *kq1;	/* kq: fork source; kq1: kqueue of fp1 */
+	struct knote *marker;
+	int error, i;
+
+	error = 0;	/* never reassigned in this function */
+	MPASS(fp == NULL);
+	MPASS(fp1->f_type == DTYPE_KQUEUE);
+
+	kq1 = fp1->f_data;
+	kq = kq1->kq_forksrc;	/* stored by kqueue_fork_alloc() */
+	marker = knote_alloc(M_WAITOK);
+	marker->kn_status = KN_MARKER;	/* placeholder used by the list walk */
+
+	KQ_LOCK(kq);
+	for (i = 0; i < kq->kq_knlistsize; i++) {	/* kq_knlist[] entries */
+		kqueue_fork_copy_list(&kq->kq_knlist[i], marker, kq, kq1,
+		    p1, fdp);
+	}
+	if (kq->kq_knhashmask != 0) {
+		for (i = 0; i <= kq->kq_knhashmask; i++) {	/* hash buckets */
+			kqueue_fork_copy_list(&kq->kq_knhash[i], marker, kq,
+			    kq1, p1, fdp);
+		}
+	}
+	kqueue_release(kq, 1);	/* pairs with the ref from kqueue_fork_alloc() */
+	kq1->kq_forksrc = NULL;
+	KQ_UNLOCK(kq);
+
+	knote_free(marker);
+	return (error);
+}
+
+static int	/* the .fo_fork method installed in kqueueops */
+kqueue_fork(struct filedesc *fdp, struct file *fp, struct file **fp1,
+    struct proc *p1, struct thread *td)
+{
+	if (*fp1 == NULL)	/* no new file yet: allocate it */
+		return (kqueue_fork_alloc(fdp, fp, fp1, td));
+	return (kqueue_fork_copy(fdp, fp, *fp1, p1, td));	/* copy knotes */
+}
+
 struct knote_status_export_bit {
 	int kn_status_bit;
 	int knt_status_bit;
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 084eaafcbdc0..6e71445f03b0 100644
--- a/sys/sys/event.h
+++ b/sys/sys/event.h
@@ -228,6 +228,7 @@ struct freebsd11_kevent32 {
 
 /* Flags for kqueuex(2) */
 #define	KQUEUE_CLOEXEC	0x00000001	/* close on exec */
+#define	KQUEUE_CPONFORK	0x00000002	/* copy on fork */
 
 struct knote;
 SLIST_HEAD(klist, knote);
@@ -283,6 +284,7 @@ struct filterops {
 	void	(*f_touch)(struct knote *kn, struct kevent *kev, u_long type);
 	int	(*f_userdump)(struct proc *p, struct knote *kn,
 		    struct kinfo_knote *kin);
+	int	(*f_copy)(struct knote *kn, struct proc *p1);
 };
 
 /*
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index 7fec444447f9..7cb3269f1fdf 100644
--- a/sys/sys/eventvar.h
+++ b/sys/sys/eventvar.h
@@ -55,12 +55,14 @@ struct kqueue {
 #define KQ_CLOSING	0x10
 #define	KQ_TASKSCHED	0x20			/* task scheduled */
 #define	KQ_TASKDRAIN	0x40			/* waiting for task to drain */
+#define	KQ_CPONFORK	0x80
 	int		kq_knlistsize;		/* size of knlist */
 	struct		klist *kq_knlist;	/* list of knotes */
 	u_long		kq_knhashmask;		/* size of knhash */
 	struct		klist *kq_knhash;	/* hash table for knotes */
 	struct		task kq_task;
 	struct		ucred *kq_cred;
+	struct		kqueue *kq_forksrc;
 };
 
 #endif /* !_SYS_EVENTVAR_H_ */
diff --git a/sys/sys/syscallsubr.h b/sys/sys/syscallsubr.h
index 8237165b84ce..d32690634059 100644
--- a/sys/sys/syscallsubr.h
+++ b/sys/sys/syscallsubr.h
@@ -211,7 +211,8 @@ int	kern_kevent_fp(struct thread *td, struct file *fp, int nchanges,
 	    int nevents, struct kevent_copyops *k_ops,
 	    const struct timespec *timeout);
 int	kern_kill(struct thread *td, pid_t pid, int signum);
-int	kern_kqueue(struct thread *td, int flags, struct filecaps *fcaps);
+int	kern_kqueue(struct thread *td, int flags, bool cponfork,
+	    struct filecaps *fcaps);
 int	kern_kldload(struct thread *td, const char *file, int *fileid);
 int	kern_kldstat(struct thread *td, int fileid, struct kld_file_stat *stat);
 int	kern_kldunload(struct thread *td, int fileid, int flags);