git: 1390bba42caf - main - file: Add a fdclose method

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Sun, 16 Nov 2025 18:24:20 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=1390bba42caf53a00fa370f3844cd7b3725ed4ec

commit 1390bba42caf53a00fa370f3844cd7b3725ed4ec
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2025-11-16 15:49:39 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-11-16 18:20:15 +0000

    file: Add a fdclose method
    
    Consider a program that creates a unix socket pair, transmits both
    sockets from one to the other using an SCM_RIGHTS message, and then
    closes both sockets without externalizing the message.  unp_gc() is
    supposed to handle cleanup, but it is only triggered by uipc_detach(),
    which runs when a unix socket is destroyed.  Because the two sockets are
    internalized, their refcounts are positive, so uipc_detach() isn't
    called.
    
    As a result, a userspace program can create an unbounded amount of
    garbage without triggering reclaim.  Let's trigger garbage collection
    whenever a unix socket is close()d.  To implement this, add new a
    fdclose file op and protocol op, and implement them accordingly.  Since
    mqueuefs has a hack to hook into the file close path, convert it to use
    the new op as well.
    
    Now, userspace can't create garbage without triggering reclamation.
    
    Reviewed by:    glebius, kib
    MFC after:      2 weeks
    Differential Revision:  https://reviews.freebsd.org/D53744
---
 sys/kern/kern_descrip.c |  8 ++-----
 sys/kern/sys_socket.c   | 12 ++++++++++
 sys/kern/uipc_mqueue.c  | 59 +++++++++++++++++++++++--------------------------
 sys/kern/uipc_socket.c  |  3 ++-
 sys/kern/uipc_usrreq.c  | 27 ++++++++++++++++++----
 sys/sys/file.h          |  2 ++
 sys/sys/mqueue.h        |  5 -----
 sys/sys/protosw.h       |  7 ++++--
 8 files changed, 74 insertions(+), 49 deletions(-)

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index a71a601733e5..2fa0621bdfca 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -194,7 +194,6 @@ struct filedesc0 {
  */
 static int __exclusive_cache_line openfiles; /* actual number of open files */
 struct mtx sigio_lock;		/* mtx to protect pointers to sigio */
-void __read_mostly (*mq_fdclose)(struct thread *td, int fd, struct file *fp);
 
 /*
  * If low >= size, just return low. Otherwise find the first zero bit in the
@@ -1413,11 +1412,8 @@ closefp_impl(struct filedesc *fdp, int fd, struct file *fp, struct thread *td,
 	if (__predict_false(!TAILQ_EMPTY(&fdp->fd_kqlist)))
 		knote_fdclose(td, fd);
 
-	/*
-	 * We need to notify mqueue if the object is of type mqueue.
-	 */
-	if (__predict_false(fp->f_type == DTYPE_MQUEUE))
-		mq_fdclose(td, fd, fp);
+	if (fp->f_ops->fo_fdclose != NULL)
+		fp->f_ops->fo_fdclose(fp, fd, td);
 	FILEDESC_XUNLOCK(fdp);
 
 #ifdef AUDIT
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index bc0725230cca..8cf5ffcdbb96 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -90,6 +90,7 @@ static fo_poll_t soo_poll;
 static fo_kqfilter_t soo_kqfilter;
 static fo_stat_t soo_stat;
 static fo_close_t soo_close;
+static fo_fdclose_t soo_fdclose;
 static fo_chmod_t soo_chmod;
 static fo_fill_kinfo_t soo_fill_kinfo;
 static fo_aio_queue_t soo_aio_queue;
@@ -105,6 +106,7 @@ const struct fileops socketops = {
 	.fo_kqfilter = soo_kqfilter,
 	.fo_stat = soo_stat,
 	.fo_close = soo_close,
+	.fo_fdclose = soo_fdclose,
 	.fo_chmod = soo_chmod,
 	.fo_chown = invfo_chown,
 	.fo_sendfile = invfo_sendfile,
@@ -362,6 +364,16 @@ soo_close(struct file *fp, struct thread *td)
 	return (error);
 }
 
+static void
+soo_fdclose(struct file *fp, int fd __unused, struct thread *td)
+{
+	struct socket *so;
+
+	so = fp->f_data;
+	if (so->so_proto->pr_fdclose != NULL)
+		so->so_proto->pr_fdclose(so);
+}
+
 static int
 soo_chmod(struct file *fp, mode_t mode, struct ucred *cred, struct thread *td)
 {
diff --git a/sys/kern/uipc_mqueue.c b/sys/kern/uipc_mqueue.c
index 4c1bb1ff228e..68eda5ecb039 100644
--- a/sys/kern/uipc_mqueue.c
+++ b/sys/kern/uipc_mqueue.c
@@ -267,7 +267,6 @@ static int	_mqueue_send(struct mqueue *mq, struct mqueue_msg *msg,
 static int	_mqueue_recv(struct mqueue *mq, struct mqueue_msg **msg,
 			int timo);
 static void	mqueue_send_notification(struct mqueue *mq);
-static void	mqueue_fdclose(struct thread *td, int fd, struct file *fp);
 static void	mq_proc_exit(void *arg, struct proc *p);
 
 /*
@@ -688,7 +687,6 @@ mqfs_init(struct vfsconf *vfc)
 	mqfs_fixup_dir(root);
 	exit_tag = EVENTHANDLER_REGISTER(process_exit, mq_proc_exit, NULL,
 	    EVENTHANDLER_PRI_ANY);
-	mq_fdclose = mqueue_fdclose;
 	p31b_setcfg(CTL_P1003_1B_MESSAGE_PASSING, _POSIX_MESSAGE_PASSING);
 	mqfs_osd_jail_slot = osd_jail_register(NULL, methods);
 	return (0);
@@ -2472,35 +2470,6 @@ sys_kmq_notify(struct thread *td, struct kmq_notify_args *uap)
 	return (kern_kmq_notify(td, uap->mqd, evp));
 }
 
-static void
-mqueue_fdclose(struct thread *td, int fd, struct file *fp)
-{
-	struct mqueue *mq;
-#ifdef INVARIANTS
-	struct filedesc *fdp;
-
-	fdp = td->td_proc->p_fd;
-	FILEDESC_LOCK_ASSERT(fdp);
-#endif
-
-	if (fp->f_ops == &mqueueops) {
-		mq = FPTOMQ(fp);
-		mtx_lock(&mq->mq_mutex);
-		notifier_remove(td->td_proc, mq, fd);
-
-		/* have to wakeup thread in same process */
-		if (mq->mq_flags & MQ_RSEL) {
-			mq->mq_flags &= ~MQ_RSEL;
-			selwakeup(&mq->mq_rsel);
-		}
-		if (mq->mq_flags & MQ_WSEL) {
-			mq->mq_flags &= ~MQ_WSEL;
-			selwakeup(&mq->mq_wsel);
-		}
-		mtx_unlock(&mq->mq_mutex);
-	}
-}
-
 static void
 mq_proc_exit(void *arg __unused, struct proc *p)
 {
@@ -2566,6 +2535,33 @@ mqf_close(struct file *fp, struct thread *td)
 	return (0);
 }
 
+static void
+mqf_fdclose(struct file *fp, int fd, struct thread *td)
+{
+	struct mqueue *mq;
+#ifdef INVARIANTS
+	struct filedesc *fdp;
+
+	fdp = td->td_proc->p_fd;
+	FILEDESC_LOCK_ASSERT(fdp);
+#endif
+
+	mq = FPTOMQ(fp);
+	mtx_lock(&mq->mq_mutex);
+	notifier_remove(td->td_proc, mq, fd);
+
+	/* have to wakeup thread in same process */
+	if (mq->mq_flags & MQ_RSEL) {
+		mq->mq_flags &= ~MQ_RSEL;
+		selwakeup(&mq->mq_rsel);
+	}
+	if (mq->mq_flags & MQ_WSEL) {
+		mq->mq_flags &= ~MQ_WSEL;
+		selwakeup(&mq->mq_wsel);
+	}
+	mtx_unlock(&mq->mq_mutex);
+}
+
 static int
 mqf_stat(struct file *fp, struct stat *st, struct ucred *active_cred)
 {
@@ -2694,6 +2690,7 @@ static const struct fileops mqueueops = {
 	.fo_kqfilter		= mqf_kqfilter,
 	.fo_stat		= mqf_stat,
 	.fo_close		= mqf_close,
+	.fo_fdclose		= mqf_fdclose,
 	.fo_chmod		= mqf_chmod,
 	.fo_chown		= mqf_chown,
 	.fo_sendfile		= invfo_sendfile,
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index eb9544628137..00aa5f9309b2 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -54,7 +54,8 @@
  * consumer of a socket is starting to tear down the socket, and that the
  * protocol should terminate the connection.  Historically, pr_abort() also
  * detached protocol state from the socket state, but this is no longer the
- * case.
+ * case.  pr_fdclose() is called when userspace invokes close(2) on a socket
+ * file descriptor.
  *
  * socreate() creates a socket and attaches protocol state.  This is a public
  * interface that may be used by socket layer consumers to create new
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index 6996f3d447bd..60736af5adf6 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -792,13 +792,19 @@ uipc_connect2(struct socket *so1, struct socket *so2)
 	return (0);
 }
 
+static void
+maybe_schedule_gc(void)
+{
+	if (atomic_load_int(&unp_rights) != 0)
+		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
+}
+
 static void
 uipc_detach(struct socket *so)
 {
 	struct unpcb *unp, *unp2;
 	struct mtx *vplock;
 	struct vnode *vp;
-	int local_unp_rights;
 
 	unp = sotounpcb(so);
 	KASSERT(unp != NULL, ("uipc_detach: unp == NULL"));
@@ -854,7 +860,6 @@ uipc_detach(struct socket *so)
 	UNP_REF_LIST_UNLOCK();
 
 	UNP_PCB_LOCK(unp);
-	local_unp_rights = unp_rights;
 	unp->unp_socket->so_pcb = NULL;
 	unp->unp_socket = NULL;
 	free(unp->unp_addr, M_SONAME);
@@ -865,8 +870,7 @@ uipc_detach(struct socket *so)
 		mtx_unlock(vplock);
 		vrele(vp);
 	}
-	if (local_unp_rights)
-		taskqueue_enqueue_timeout(taskqueue_thread, &unp_gc_task, -1);
+	maybe_schedule_gc();
 
 	switch (so->so_type) {
 	case SOCK_STREAM:
@@ -902,6 +906,18 @@ uipc_disconnect(struct socket *so)
 	return (0);
 }
 
+static void
+uipc_fdclose(struct socket *so __unused)
+{
+	/*
+	 * Ensure that userspace can't create orphaned file descriptors without
+	 * triggering garbage collection.  Triggering GC from uipc_detach() is
+	 * not sufficient, since that's only closed once a socket reference
+	 * count drops to zero.
+	 */
+	maybe_schedule_gc();
+}
+
 static int
 uipc_listen(struct socket *so, int backlog, struct thread *td)
 {
@@ -4372,6 +4388,7 @@ static struct protosw streamproto = {
 	.pr_connect2 =		uipc_connect2,
 	.pr_detach =		uipc_detach,
 	.pr_disconnect =	uipc_disconnect,
+	.pr_fdclose =		uipc_fdclose,
 	.pr_listen =		uipc_listen,
 	.pr_peeraddr =		uipc_peeraddr,
 	.pr_send =		uipc_sendfile,
@@ -4402,6 +4419,7 @@ static struct protosw dgramproto = {
 	.pr_connect2 =		uipc_connect2,
 	.pr_detach =		uipc_detach,
 	.pr_disconnect =	uipc_disconnect,
+	.pr_fdclose =		uipc_fdclose,
 	.pr_peeraddr =		uipc_peeraddr,
 	.pr_sosend =		uipc_sosend_dgram,
 	.pr_sense =		uipc_sense,
@@ -4426,6 +4444,7 @@ static struct protosw seqpacketproto = {
 	.pr_connect2 =		uipc_connect2,
 	.pr_detach =		uipc_detach,
 	.pr_disconnect =	uipc_disconnect,
+	.pr_fdclose =		uipc_fdclose,
 	.pr_listen =		uipc_listen,
 	.pr_peeraddr =		uipc_peeraddr,
 	.pr_sense =		uipc_sense,
diff --git a/sys/sys/file.h b/sys/sys/file.h
index e0195c7c6c2a..89dba2b1c3f8 100644
--- a/sys/sys/file.h
+++ b/sys/sys/file.h
@@ -116,6 +116,7 @@ typedef	int fo_kqfilter_t(struct file *fp, struct knote *kn);
 typedef	int fo_stat_t(struct file *fp, struct stat *sb,
 		    struct ucred *active_cred);
 typedef	int fo_close_t(struct file *fp, struct thread *td);
+typedef	void fo_fdclose_t(struct file *fp, int fd, struct thread *td);
 typedef	int fo_chmod_t(struct file *fp, mode_t mode,
 		    struct ucred *active_cred, struct thread *td);
 typedef	int fo_chown_t(struct file *fp, uid_t uid, gid_t gid,
@@ -153,6 +154,7 @@ struct fileops {
 	fo_kqfilter_t	*fo_kqfilter;
 	fo_stat_t	*fo_stat;
 	fo_close_t	*fo_close;
+	fo_fdclose_t	*fo_fdclose;
 	fo_chmod_t	*fo_chmod;
 	fo_chown_t	*fo_chown;
 	fo_sendfile_t	*fo_sendfile;
diff --git a/sys/sys/mqueue.h b/sys/sys/mqueue.h
index 50f6681ce218..211f09d57427 100644
--- a/sys/sys/mqueue.h
+++ b/sys/sys/mqueue.h
@@ -37,9 +37,4 @@ struct mq_attr {
 	long    __reserved[4];  /* Ignored for input, zeroed for output */
 };
 
-#ifdef _KERNEL
-struct thread;
-struct file;
-extern void	(*mq_fdclose)(struct thread *td, int fd, struct file *fp);
-#endif
 #endif
diff --git a/sys/sys/protosw.h b/sys/sys/protosw.h
index 7fb8dfdc7208..392ff7cf678e 100644
--- a/sys/sys/protosw.h
+++ b/sys/sys/protosw.h
@@ -94,6 +94,7 @@ typedef int	pr_sopoll_t(struct socket *, int, struct thread *);
 typedef int	pr_kqfilter_t(struct socket *, struct knote *);
 typedef void	pr_sosetlabel_t(struct socket *);
 typedef void	pr_close_t(struct socket *);
+typedef void	pr_fdclose_t(struct socket *);
 typedef int	pr_bindat_t(int, struct socket *, struct sockaddr *,
 		    struct thread *);
 typedef int	pr_connectat_t(int, struct socket *, struct sockaddr *,
@@ -120,8 +121,8 @@ struct protosw {
 	pr_detach_t	*pr_detach;	/* destruction: sofree() */
 	pr_connect_t	*pr_connect;	/* connect(2) */
 	pr_disconnect_t	*pr_disconnect;	/* sodisconnect() */
-	pr_close_t	*pr_close;	/* close(2) */
-	pr_shutdown_t	*pr_shutdown;	/* shutdown(2) */
+	pr_close_t	*pr_close;	/* soclose(), socket refcount is 0 */
+	pr_fdclose_t	*pr_fdclose;	/* close(2) */
 	pr_rcvd_t	*pr_rcvd;	/* soreceive_generic() if PR_WANTRCVD */
 	pr_aio_queue_t	*pr_aio_queue;	/* aio(9) */
 /* Cache line #3 */
@@ -142,7 +143,9 @@ struct protosw {
 	pr_sosetlabel_t	*pr_sosetlabel;	/* MAC, XXXGL: remove */
 	pr_setsbopt_t	*pr_setsbopt;	/* Socket buffer ioctls */
 	pr_chmod_t	*pr_chmod;	/* fchmod(2) */
+/* Cache line #5 */
 	pr_kqfilter_t	*pr_kqfilter;	/* kevent(2) */
+	pr_shutdown_t	*pr_shutdown;	/* shutdown(2) */
 };
 #endif	/* defined(_KERNEL) || defined(_WANT_PROTOSW) */
 #ifdef _KERNEL