git: 3b263fa76c08 - main - if_tuntap: Try to fix device refcount bugs

From: Mark Johnston <markj_at_FreeBSD.org>
Date: Mon, 28 Jul 2025 16:19:49 UTC
The branch main has been updated by markj:

URL: https://cgit.FreeBSD.org/src/commit/?id=3b263fa76c08993f37ce94cb7cd4605f67dd5965

commit 3b263fa76c08993f37ce94cb7cd4605f67dd5965
Author:     Mark Johnston <markj@FreeBSD.org>
AuthorDate: 2025-07-28 16:03:20 +0000
Commit:     Mark Johnston <markj@FreeBSD.org>
CommitDate: 2025-07-28 16:19:38 +0000

    if_tuntap: Try to fix device refcount bugs
    
    There are two ways to create a tun device, via devfs cloning or with
    ifconfig.  The latter is implemented by tun_clone_create() and the
    former by tunclone(), which invokes tun_clone_create() via
    if_clone_create().  Both of these functions were invoking dev_ref()
    after creating the devfs_node(), but this was extraneous.  tunclone()
    does need to acquire an extra reference since this is required by the
    dev_clone EVENTHANDLER interface contract, but it was already doing so
    by specifying MAKEDEV_REF.  Fix this by removing unnecessary refcount
    acquisitions.
    
    A second problem is with teardown in a VNET jail.  A tun interface
    created by device cloning will hold a credential reference for the jail,
    which prevents it from being destroyed and in particular prevents VNET
    SYSUNINITs from running.  To fix this, we need to register a
    PR_METHOD_REMOVE callback for jail teardown which, in a VNET jail,
    destroys cloned interfaces.  This way, jail teardown can proceed.
    
    These bugs are noticeable with something like
    
      # jail -c name=test vnet command=ls /dev/tun
      # jls -vd
    
    While here, add some comments and be sure to destroy a nascent mutex and
    condition variable in an error path.
    
    Reviewed by:    kib
    MFC after:      1 month
    Sponsored by:   Stormshield
    Sponsored by:   Klara, Inc.
    Differential Revision:  https://reviews.freebsd.org/D51525
---
 sys/net/if_tuntap.c | 76 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/sys/net/if_tuntap.c b/sys/net/if_tuntap.c
index 3bab04aa4d38..5e6f65c04b2f 100644
--- a/sys/net/if_tuntap.c
+++ b/sys/net/if_tuntap.c
@@ -74,6 +74,7 @@
 #include <sys/malloc.h>
 #include <sys/random.h>
 #include <sys/ctype.h>
+#include <sys/osd.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
@@ -178,6 +179,7 @@ struct tuntap_softc {
 static struct mtx tunmtx;
 static eventhandler_tag arrival_tag;
 static eventhandler_tag clone_tag;
+static int tuntap_osd_jail_slot;
 static const char tunname[] = "tun";
 static const char tapname[] = "tap";
 static const char vmnetname[] = "vmnet";
@@ -497,6 +499,10 @@ vmnet_clone_match(struct if_clone *ifc, const char *name)
 	return (0);
 }
 
+/*
+ * Create a clone via the ifnet cloning mechanism.  Note that this is invoked
+ * indirectly by tunclone() below.
+ */
 static int
 tun_clone_create(struct if_clone *ifc, char *name, size_t len,
     struct ifc_data *ifd, struct ifnet **ifpp)
@@ -532,15 +538,19 @@ tun_clone_create(struct if_clone *ifc, char *name, size_t len,
 	if (i != 0)
 		i = tun_create_device(drv, unit, NULL, &dev, name);
 	if (i == 0) {
-		dev_ref(dev);
+		struct tuntap_softc *tp;
+
 		tuncreate(dev);
-		struct tuntap_softc *tp = dev->si_drv1;
+		tp = dev->si_drv1;
 		*ifpp = tp->tun_ifp;
 	}
 
 	return (i);
 }
 
+/*
+ * Create a clone via devfs access.
+ */
 static void
 tunclone(void *arg, struct ucred *cred, char *name, int namelen,
     struct cdev **dev)
@@ -595,11 +605,12 @@ tunclone(void *arg, struct ucred *cred, char *name, int namelen,
 		}
 
 		i = tun_create_device(drv, u, cred, dev, name);
-	}
-	if (i == 0) {
+	} else {
+		/* Consumed by the dev_clone invoker. */
 		dev_ref(*dev);
-		if_clone_create(name, namelen, NULL);
 	}
+	if (i == 0)
+		if_clone_create(name, namelen, NULL);
 out:
 	CURVNET_RESTORE();
 }
@@ -669,16 +680,6 @@ vnet_tun_init(const void *unused __unused)
 VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
 		vnet_tun_init, NULL);
 
-static void
-vnet_tun_uninit(const void *unused __unused)
-{
-
-	for (u_int i = 0; i < NDRV; ++i)
-		if_clone_detach(V_tuntap_driver_cloners[i]);
-}
-VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
-    vnet_tun_uninit, NULL);
-
 static void
 tun_uninit(const void *unused __unused)
 {
@@ -689,6 +690,16 @@ tun_uninit(const void *unused __unused)
 	EVENTHANDLER_DEREGISTER(ifnet_arrival_event, arrival_tag);
 	EVENTHANDLER_DEREGISTER(dev_clone, clone_tag);
 
+	CURVNET_SET(vnet0);
+	for (u_int i = 0; i < NDRV; i++) {
+		if_clone_detach(V_tuntap_driver_cloners[i]);
+		V_tuntap_driver_cloners[i] = NULL;
+	}
+	CURVNET_RESTORE();
+
+	if (tuntap_osd_jail_slot != 0)
+		osd_jail_deregister(tuntap_osd_jail_slot);
+
 	mtx_lock(&tunmtx);
 	while ((tp = TAILQ_FIRST(&tunhead)) != NULL) {
 		TAILQ_REMOVE(&tunhead, tp, tun_list);
@@ -724,6 +735,30 @@ tuntap_driver_from_ifnet(const struct ifnet *ifp)
 	return (NULL);
 }
 
+/*
+ * Remove devices that were created by devfs cloning, as they hold references
+ * which prevent the prison from collapsing, in which state VNET sysuninits will
+ * not be invoked.
+ */
+static int
+tuntap_prison_remove(void *obj, void *data __unused)
+{
+#ifdef VIMAGE
+	struct prison *pr;
+
+	pr = obj;
+	if (prison_owns_vnet(pr)) {
+		CURVNET_SET(pr->pr_vnet);
+		for (u_int i = 0; i < NDRV; i++) {
+			if_clone_detach(V_tuntap_driver_cloners[i]);
+			V_tuntap_driver_cloners[i] = NULL;
+		}
+		CURVNET_RESTORE();
+	}
+#endif
+	return (0);
+}
+
 static int
 tuntapmodevent(module_t mod, int type, void *data)
 {
@@ -738,8 +773,12 @@ tuntapmodevent(module_t mod, int type, void *data)
 			clone_setup(&drv->clones);
 			drv->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
 		}
+		osd_method_t methods[PR_MAXMETHOD] = {
+			[PR_METHOD_REMOVE] = tuntap_prison_remove,
+		};
+		tuntap_osd_jail_slot = osd_jail_register(NULL, methods);
 		arrival_tag = EVENTHANDLER_REGISTER(ifnet_arrival_event,
-		   tunrename, 0, 1000);
+		    tunrename, 0, 1000);
 		if (arrival_tag == NULL)
 			return (ENOMEM);
 		clone_tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
@@ -747,7 +786,7 @@ tuntapmodevent(module_t mod, int type, void *data)
 			return (ENOMEM);
 		break;
 	case MOD_UNLOAD:
-		/* See tun_uninit, so it's done after the vnet_sysuninit() */
+		/* See tun_uninit(). */
 		break;
 	default:
 		return EOPNOTSUPP;
@@ -798,6 +837,8 @@ tun_create_device(struct tuntap_driver *drv, int unit, struct ucred *cr,
 	args.mda_si_drv1 = tp;
 	error = make_dev_s(&args, dev, "%s", name);
 	if (error != 0) {
+		mtx_destroy(&tp->tun_mtx);
+		cv_destroy(&tp->tun_cv);
 		free(tp, M_TUN);
 		return (error);
 	}
@@ -914,7 +955,6 @@ tap_transmit(struct ifnet *ifp, struct mbuf *m)
 	return (error);
 }
 
-/* XXX: should return an error code so it can fail. */
 static void
 tuncreate(struct cdev *dev)
 {