git: c746ed724d01 - main - Allow stacked filesystems to be recursively unmounted

Jason A. Harmening jah at FreeBSD.org
Sat Jul 24 20:39:21 UTC 2021


The branch main has been updated by jah:

URL: https://cgit.FreeBSD.org/src/commit/?id=c746ed724d01b439d42aff01cbe88eadacd0ac0d

commit c746ed724d01b439d42aff01cbe88eadacd0ac0d
Author:     Jason A. Harmening <jah at FreeBSD.org>
AuthorDate: 2021-06-12 19:42:12 +0000
Commit:     Jason A. Harmening <jah at FreeBSD.org>
CommitDate: 2021-07-24 19:52:00 +0000

    Allow stacked filesystems to be recursively unmounted
    
    In certain emergency cases such as media failure or removal, UFS will
    initiate a forced unmount in order to prevent dirty buffers from
    accumulating against the no-longer-usable filesystem.  The presence
    of a stacked filesystem such as nullfs or unionfs above the UFS mount
    will prevent this forced unmount from succeeding.
    
    This change addresses the situation by allowing stacked filesystems to
    be recursively unmounted on a taskqueue thread when the MNT_RECURSE
    flag is specified to dounmount().  This call will block until all upper
    mounts have been removed unless the caller specifies the MNT_DEFERRED
    flag to indicate the base filesystem should also be unmounted from the
    taskqueue.
    
    To achieve this, the recently-added vfs_pin_from_vp()/vfs_unpin() KPIs
    have been combined with the existing 'mnt_uppers' list used by nullfs
    and renamed to vfs_register_upper_from_vp()/vfs_unregister_upper().
    The format of the mnt_uppers list has also been changed to accommodate
    filesystems such as unionfs in which a given mount may be stacked atop
    more than one lower mount.  Additionally, management of lower FS
    reclaim/unlink notifications has been split into a separate list
    managed by a separate set of KPIs, as registration of an upper FS no
    longer implies interest in these notifications.
    
    Reviewed by:    kib, mckusick
    Tested by:      pho
    Differential Revision:  https://reviews.freebsd.org/D31016
---
 sys/fs/nullfs/null.h          |   2 +
 sys/fs/nullfs/null_vfsops.c   |  30 ++---
 sys/fs/unionfs/union.h        |   2 +
 sys/fs/unionfs/union_vfsops.c |  14 ++-
 sys/kern/vfs_mount.c          | 280 +++++++++++++++++++++++++++++++++++++-----
 sys/kern/vfs_subr.c           |  55 +++++----
 sys/sys/mount.h               |  51 ++++++--
 sys/ufs/ffs/ffs_vfsops.c      |   2 +-
 8 files changed, 348 insertions(+), 88 deletions(-)

diff --git a/sys/fs/nullfs/null.h b/sys/fs/nullfs/null.h
index 6fdac4b1006e..a41625536d65 100644
--- a/sys/fs/nullfs/null.h
+++ b/sys/fs/nullfs/null.h
@@ -45,6 +45,8 @@ struct null_mount {
 	struct mount	*nullm_vfs;
 	struct vnode	*nullm_lowerrootvp;	/* Ref to lower root vnode */
 	uint64_t	nullm_flags;
+	struct mount_upper_node upper_node;
+	struct mount_upper_node notify_node;
 };
 
 #ifdef _KERNEL
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index 4914e5fc2dbf..73301c9275d2 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -163,7 +163,8 @@ nullfs_mount(struct mount *mp)
 	 * Save pointer to underlying FS and the reference to the
 	 * lower root vnode.
 	 */
-	xmp->nullm_vfs = vfs_pin_from_vp(lowerrootvp);
+	xmp->nullm_vfs = vfs_register_upper_from_vp(lowerrootvp, mp,
+	    &xmp->upper_node);
 	if (xmp->nullm_vfs == NULL) {
 		vput(lowerrootvp);
 		free(xmp, M_NULLFSMNT);
@@ -178,7 +179,7 @@ nullfs_mount(struct mount *mp)
 	 */
 	error = null_nodeget(mp, lowerrootvp, &nullm_rootvp);
 	if (error != 0) {
-		vfs_unpin(xmp->nullm_vfs);
+		vfs_unregister_upper(xmp->nullm_vfs, &xmp->upper_node);
 		vrele(lowerrootvp);
 		free(xmp, M_NULLFSMNT);
 		return (error);
@@ -195,6 +196,11 @@ nullfs_mount(struct mount *mp)
 	    (xmp->nullm_vfs->mnt_kern_flag & MNTK_NULL_NOCACHE) != 0)
 		xmp->nullm_flags &= ~NULLM_CACHE;
 
+	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
+		vfs_register_for_notification(xmp->nullm_vfs, mp,
+		    &xmp->notify_node);
+	}
+
 	MNT_ILOCK(mp);
 	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
 		mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
@@ -206,13 +212,6 @@ nullfs_mount(struct mount *mp)
 	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
 	MNT_IUNLOCK(mp);
 	vfs_getnewfsid(mp);
-	if ((xmp->nullm_flags & NULLM_CACHE) != 0) {
-		MNT_ILOCK(xmp->nullm_vfs);
-		TAILQ_INSERT_TAIL(&xmp->nullm_vfs->mnt_uppers, mp,
-		    mnt_upper_link);
-		MNT_IUNLOCK(xmp->nullm_vfs);
-	}
-
 	vfs_mountedfrom(mp, target);
 	vput(nullm_rootvp);
 
@@ -230,7 +229,6 @@ nullfs_unmount(mp, mntflags)
 	int mntflags;
 {
 	struct null_mount *mntdata;
-	struct mount *ump;
 	int error, flags;
 
 	NULLFSDEBUG("nullfs_unmount: mp = %p\n", (void *)mp);
@@ -259,17 +257,11 @@ nullfs_unmount(mp, mntflags)
 	 * Finally, throw away the null_mount structure
 	 */
 	mntdata = mp->mnt_data;
-	ump = mntdata->nullm_vfs;
 	if ((mntdata->nullm_flags & NULLM_CACHE) != 0) {
-		MNT_ILOCK(ump);
-		while ((ump->mnt_kern_flag & MNTK_VGONE_UPPER) != 0) {
-			ump->mnt_kern_flag |= MNTK_VGONE_WAITER;
-			msleep(&ump->mnt_uppers, &ump->mnt_mtx, 0, "vgnupw", 0);
-		}
-		TAILQ_REMOVE(&ump->mnt_uppers, mp, mnt_upper_link);
-		MNT_IUNLOCK(ump);
+		vfs_unregister_for_notification(mntdata->nullm_vfs,
+		    &mntdata->notify_node);
 	}
-	vfs_unpin(ump);
+	vfs_unregister_upper(mntdata->nullm_vfs, &mntdata->upper_node);
 	vrele(mntdata->nullm_lowerrootvp);
 	mp->mnt_data = NULL;
 	free(mntdata, M_NULLFSMNT);
diff --git a/sys/fs/unionfs/union.h b/sys/fs/unionfs/union.h
index 64706b2b21a2..96180480dbec 100644
--- a/sys/fs/unionfs/union.h
+++ b/sys/fs/unionfs/union.h
@@ -57,6 +57,8 @@ struct unionfs_mount {
 	struct vnode   *um_lowervp;	/* VREFed once */
 	struct vnode   *um_uppervp;	/* VREFed once */
 	struct vnode   *um_rootvp;	/* ROOT vnode */
+	struct mount_upper_node	um_lower_link;	/* node in lower FS list of uppers */
+	struct mount_upper_node	um_upper_link;	/* node in upper FS list of uppers */
 	unionfs_copymode um_copymode;
 	unionfs_whitemode um_whitemode;
 	uid_t		um_uid;
diff --git a/sys/fs/unionfs/union_vfsops.c b/sys/fs/unionfs/union_vfsops.c
index 96a30f0ae8b5..c17650dedc63 100644
--- a/sys/fs/unionfs/union_vfsops.c
+++ b/sys/fs/unionfs/union_vfsops.c
@@ -292,14 +292,16 @@ unionfs_domount(struct mount *mp)
 		return (error);
 	}
 
-	lowermp = vfs_pin_from_vp(ump->um_lowervp);
-	uppermp = vfs_pin_from_vp(ump->um_uppervp);
+	lowermp = vfs_register_upper_from_vp(ump->um_lowervp, mp,
+	    &ump->um_lower_link);
+	uppermp = vfs_register_upper_from_vp(ump->um_uppervp, mp,
+	    &ump->um_upper_link);
 
 	if (lowermp == NULL || uppermp == NULL) {
 		if (lowermp != NULL)
-			vfs_unpin(lowermp);
+			vfs_unregister_upper(lowermp, &ump->um_lower_link);
 		if (uppermp != NULL)
-			vfs_unpin(uppermp);
+			vfs_unregister_upper(uppermp, &ump->um_upper_link);
 		free(ump, M_UNIONFSMNT);
 		mp->mnt_data = NULL;
 		return (ENOENT);
@@ -357,8 +359,8 @@ unionfs_unmount(struct mount *mp, int mntflags)
 	if (error)
 		return (error);
 
-	vfs_unpin(ump->um_lowervp->v_mount);
-	vfs_unpin(ump->um_uppervp->v_mount);
+	vfs_unregister_upper(ump->um_lowervp->v_mount, &ump->um_lower_link);
+	vfs_unregister_upper(ump->um_uppervp->v_mount, &ump->um_upper_link);
 	free(ump, M_UNIONFSMNT);
 	mp->mnt_data = NULL;
 
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
index 354113eb3277..3c546392b213 100644
--- a/sys/kern/vfs_mount.c
+++ b/sys/kern/vfs_mount.c
@@ -65,6 +65,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/sysctl.h>
 #include <sys/sysent.h>
 #include <sys/systm.h>
+#include <sys/taskqueue.h>
 #include <sys/vnode.h>
 #include <vm/uma.h>
 
@@ -89,6 +90,11 @@ static bool	default_autoro = false;
 SYSCTL_BOOL(_vfs, OID_AUTO, default_autoro, CTLFLAG_RW, &default_autoro, 0,
     "Retry failed r/w mount as r/o if no explicit ro/rw option is specified");
 
+static bool	recursive_forced_unmount = false;
+SYSCTL_BOOL(_vfs, OID_AUTO, recursive_forced_unmount, CTLFLAG_RW,
+    &recursive_forced_unmount, 0, "Recursively unmount stacked upper mounts"
+    " when a file system is forcibly unmounted");
+
 MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount structure");
 MALLOC_DEFINE(M_STATFS, "statfs", "statfs structure");
 static uma_zone_t mount_zone;
@@ -103,6 +109,16 @@ MTX_SYSINIT(mountlist, &mountlist_mtx, "mountlist", MTX_DEF);
 EVENTHANDLER_LIST_DEFINE(vfs_mounted);
 EVENTHANDLER_LIST_DEFINE(vfs_unmounted);
 
+static void vfs_deferred_unmount(void *arg, int pending);
+static struct task deferred_unmount_task =
+    TASK_INITIALIZER(0, vfs_deferred_unmount, NULL);;
+static struct mtx deferred_unmount_lock;
+MTX_SYSINIT(deferred_unmount, &deferred_unmount_lock, "deferred_unmount",
+    MTX_DEF);
+static STAILQ_HEAD(, mount) deferred_unmount_list =
+    STAILQ_HEAD_INITIALIZER(deferred_unmount_list);
+TASKQUEUE_DEFINE_THREAD(deferred_unmount);
+
 static void mount_devctl_event(const char *type, struct mount *mp, bool donew);
 
 /*
@@ -505,8 +521,21 @@ vfs_ref(struct mount *mp)
 	MNT_IUNLOCK(mp);
 }
 
+/*
+ * Register ump as an upper mount of the mount associated with
+ * vnode vp.  This registration will be tracked through
+ * mount_upper_node upper, which should be allocated by the
+ * caller and stored in per-mount data associated with ump.
+ *
+ * If successful, this function will return the mount associated
+ * with vp, and will ensure that it cannot be unmounted until
+ * ump has been unregistered as one of its upper mounts.
+ * 
+ * Upon failure this function will return NULL.
+ */
 struct mount *
-vfs_pin_from_vp(struct vnode *vp)
+vfs_register_upper_from_vp(struct vnode *vp, struct mount *ump,
+    struct mount_upper_node *upper)
 {
 	struct mount *mp;
 
@@ -514,26 +543,81 @@ vfs_pin_from_vp(struct vnode *vp)
 	if (mp == NULL)
 		return (NULL);
 	MNT_ILOCK(mp);
-	if (mp != vp->v_mount || (mp->mnt_kern_flag & MNTK_UNMOUNT) != 0) {
+	if (mp != vp->v_mount ||
+	    ((mp->mnt_kern_flag & (MNTK_UNMOUNT | MNTK_RECURSE)) != 0)) {
 		MNT_IUNLOCK(mp);
 		return (NULL);
 	}
+	KASSERT(ump != mp, ("upper and lower mounts are identical"));
+	upper->mp = ump;
 	MNT_REF(mp);
-	KASSERT(mp->mnt_pinned_count < INT_MAX,
-	    ("mount pinned count overflow"));
-	++mp->mnt_pinned_count;
+	TAILQ_INSERT_TAIL(&mp->mnt_uppers, upper, mnt_upper_link);
 	MNT_IUNLOCK(mp);
 	return (mp);
 }
 
+/*
+ * Register upper mount ump to receive vnode unlink/reclaim
+ * notifications from lower mount mp. This registration will
+ * be tracked through mount_upper_node upper, which should be
+ * allocated by the caller and stored in per-mount data
+ * associated with ump.
+ *
+ * ump must already be registered as an upper mount of mp
+ * through a call to vfs_register_upper_from_vp().
+ */
 void
-vfs_unpin(struct mount *mp)
+vfs_register_for_notification(struct mount *mp, struct mount *ump,
+    struct mount_upper_node *upper)
+{
+	upper->mp = ump;
+	MNT_ILOCK(mp);
+	TAILQ_INSERT_TAIL(&mp->mnt_notify, upper, mnt_upper_link);
+	MNT_IUNLOCK(mp);
+}
+
+static void
+vfs_drain_upper_locked(struct mount *mp)
+{
+	mtx_assert(MNT_MTX(mp), MA_OWNED);
+	while (mp->mnt_upper_pending != 0) {
+		mp->mnt_kern_flag |= MNTK_UPPER_WAITER;
+		msleep(&mp->mnt_uppers, MNT_MTX(mp), 0, "mntupw", 0);
+	}
+}
+
+/*
+ * Undo a previous call to vfs_register_for_notification().
+ * The mount represented by upper must be currently registered
+ * as an upper mount for mp.
+ */
+void
+vfs_unregister_for_notification(struct mount *mp,
+    struct mount_upper_node *upper)
+{
+	MNT_ILOCK(mp);
+	vfs_drain_upper_locked(mp);
+	TAILQ_REMOVE(&mp->mnt_notify, upper, mnt_upper_link);
+	MNT_IUNLOCK(mp);
+}
+
+/*
+ * Undo a previous call to vfs_register_upper_from_vp().
+ * This must be done before mp can be unmounted.
+ */
+void
+vfs_unregister_upper(struct mount *mp, struct mount_upper_node *upper)
 {
 	MNT_ILOCK(mp);
-	KASSERT(mp->mnt_pinned_count > 0, ("mount pinned count underflow"));
 	KASSERT((mp->mnt_kern_flag & MNTK_UNMOUNT) == 0,
-	    ("mount pinned with pending unmount"));
-	--mp->mnt_pinned_count;
+	    ("registered upper with pending unmount"));
+	vfs_drain_upper_locked(mp);
+	TAILQ_REMOVE(&mp->mnt_uppers, upper, mnt_upper_link);
+	if ((mp->mnt_kern_flag & MNTK_TASKQUEUE_WAITER) != 0 &&
+	    TAILQ_EMPTY(&mp->mnt_uppers)) {
+		mp->mnt_kern_flag &= ~MNTK_TASKQUEUE_WAITER;
+		wakeup(&mp->mnt_taskqueue_link);
+	}
 	MNT_REL(mp);
 	MNT_IUNLOCK(mp);
 }
@@ -600,8 +684,10 @@ vfs_mount_alloc(struct vnode *vp, struct vfsconf *vfsp, const char *fspath,
 	mac_mount_create(cred, mp);
 #endif
 	arc4rand(&mp->mnt_hashseed, sizeof mp->mnt_hashseed, 0);
+	mp->mnt_upper_pending = 0;
 	TAILQ_INIT(&mp->mnt_uppers);
-	mp->mnt_pinned_count = 0;
+	TAILQ_INIT(&mp->mnt_notify);
+	mp->mnt_taskqueue_flags = 0;
 	return (mp);
 }
 
@@ -640,9 +726,9 @@ vfs_mount_destroy(struct mount *mp)
 			vn_printf(vp, "dangling vnode ");
 		panic("unmount: dangling vnode");
 	}
-	KASSERT(mp->mnt_pinned_count == 0,
-	   ("mnt_pinned_count = %d", mp->mnt_pinned_count));
+	KASSERT(mp->mnt_upper_pending == 0, ("mnt_upper_pending"));
 	KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers"));
+	KASSERT(TAILQ_EMPTY(&mp->mnt_notify), ("mnt_notify"));
 	if (mp->mnt_nvnodelistsize != 0)
 		panic("vfs_mount_destroy: nonzero nvnodelistsize");
 	if (mp->mnt_lazyvnodelistsize != 0)
@@ -1799,17 +1885,166 @@ vfs_mount_fetch_counter(struct mount *mp, enum mount_counter which)
 	return (sum);
 }
 
+static bool
+deferred_unmount_enqueue(struct mount *mp, uint64_t flags, bool requeue)
+{
+	bool enqueued;
+
+	enqueued = false;
+	mtx_lock(&deferred_unmount_lock);
+	if ((mp->mnt_taskqueue_flags & MNT_DEFERRED) == 0 || requeue) {
+		mp->mnt_taskqueue_flags = flags | MNT_DEFERRED;
+		STAILQ_INSERT_TAIL(&deferred_unmount_list, mp,
+		    mnt_taskqueue_link);
+		enqueued = true;
+	}
+	mtx_unlock(&deferred_unmount_lock);
+
+	if (enqueued) {
+		taskqueue_enqueue(taskqueue_deferred_unmount,
+		    &deferred_unmount_task);
+	}
+
+	return (enqueued);
+}
+
+/*
+ * Taskqueue handler for processing async/recursive unmounts
+ */
+static void
+vfs_deferred_unmount(void *argi __unused, int pending __unused)
+{
+	STAILQ_HEAD(, mount) local_unmounts;
+	uint64_t flags;
+	struct mount *mp, *tmp;
+	bool unmounted;
+
+	STAILQ_INIT(&local_unmounts);
+	mtx_lock(&deferred_unmount_lock);
+	STAILQ_CONCAT(&local_unmounts, &deferred_unmount_list); 
+	mtx_unlock(&deferred_unmount_lock);
+
+	STAILQ_FOREACH_SAFE(mp, &local_unmounts, mnt_taskqueue_link, tmp) {
+		flags = mp->mnt_taskqueue_flags;
+		KASSERT((flags & MNT_DEFERRED) != 0,
+		    ("taskqueue unmount without MNT_DEFERRED"));
+		if (dounmount(mp, flags, curthread) != 0) {
+			MNT_ILOCK(mp);
+			unmounted = ((mp->mnt_kern_flag & MNTK_REFEXPIRE) != 0);
+			MNT_IUNLOCK(mp);
+			if (!unmounted)
+				deferred_unmount_enqueue(mp, flags, true);
+			else
+				vfs_rel(mp);
+		}
+	}
+}
+
 /*
  * Do the actual filesystem unmount.
  */
 int
-dounmount(struct mount *mp, int flags, struct thread *td)
+dounmount(struct mount *mp, uint64_t flags, struct thread *td)
 {
+	struct mount_upper_node *upper;
 	struct vnode *coveredvp, *rootvp;
 	int error;
 	uint64_t async_flag;
 	int mnt_gen_r;
 
+	KASSERT((flags & MNT_DEFERRED) == 0 ||
+	    (flags & (MNT_RECURSE | MNT_FORCE)) == (MNT_RECURSE | MNT_FORCE),
+	    ("MNT_DEFERRED requires MNT_RECURSE | MNT_FORCE"));
+
+	/*
+	 * If the caller has explicitly requested the unmount to be handled by
+	 * the taskqueue and we're not already in taskqueue context, queue
+	 * up the unmount request and exit.  This is done prior to any
+	 * credential checks; MNT_DEFERRED should be used only for kernel-
+	 * initiated unmounts and will therefore be processed with the
+	 * (kernel) credentials of the taskqueue thread.  Still, callers
+	 * should be sure this is the behavior they want.
+	 */
+	if ((flags & MNT_DEFERRED) != 0 &&
+	    taskqueue_member(taskqueue_deferred_unmount, curthread) == 0) {
+		if (!deferred_unmount_enqueue(mp, flags, false))
+			vfs_rel(mp);
+		return (EINPROGRESS);
+	}
+
+	/*
+	 * Only privileged root, or (if MNT_USER is set) the user that did the
+	 * original mount is permitted to unmount this filesystem.
+	 * This check should be made prior to queueing up any recursive
+	 * unmounts of upper filesystems.  Those unmounts will be executed
+	 * with kernel thread credentials and are expected to succeed, so
+	 * we must at least ensure the originating context has sufficient
+	 * privilege to unmount the base filesystem before proceeding with
+	 * the uppers.
+	 */
+	error = vfs_suser(mp, td);
+	if (error != 0) {
+		KASSERT((flags & MNT_DEFERRED) == 0,
+		    ("taskqueue unmount with insufficient privilege"));
+		vfs_rel(mp);
+		return (error);
+	}
+
+	if (recursive_forced_unmount && ((flags & MNT_FORCE) != 0))
+		flags |= MNT_RECURSE;
+
+	if ((flags & MNT_RECURSE) != 0) {
+		KASSERT((flags & MNT_FORCE) != 0,
+		    ("MNT_RECURSE requires MNT_FORCE"));
+
+		MNT_ILOCK(mp);
+		/*
+		 * Set MNTK_RECURSE to prevent new upper mounts from being
+		 * added, and note that an operation on the uppers list is in
+		 * progress.  This will ensure that unregistration from the
+		 * uppers list, and therefore any pending unmount of the upper
+		 * FS, can't complete until after we finish walking the list.
+		 */
+		mp->mnt_kern_flag |= MNTK_RECURSE;
+		mp->mnt_upper_pending++;
+		TAILQ_FOREACH(upper, &mp->mnt_uppers, mnt_upper_link) {
+			MNT_IUNLOCK(mp);
+			vfs_ref(upper->mp);
+			if (!deferred_unmount_enqueue(upper->mp, flags, false))
+				vfs_rel(upper->mp);
+			MNT_ILOCK(mp);
+		}
+		mp->mnt_upper_pending--;
+		if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
+		    mp->mnt_upper_pending == 0) {
+			mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
+			wakeup(&mp->mnt_uppers);
+		}
+		/*
+		 * If we're not on the taskqueue, wait until the uppers list
+		 * is drained before proceeding with unmount.  Otherwise, if
+		 * we are on the taskqueue and there are still pending uppers,
+		 * just re-enqueue on the end of the taskqueue.
+		 */
+		if ((flags & MNT_DEFERRED) == 0) {
+			while (!TAILQ_EMPTY(&mp->mnt_uppers)) {
+				mp->mnt_kern_flag |= MNTK_TASKQUEUE_WAITER;
+				msleep(&mp->mnt_taskqueue_link, MNT_MTX(mp), 0,
+				    "umntqw", 0);
+			}
+		} else if (!TAILQ_EMPTY(&mp->mnt_uppers)) {
+			MNT_IUNLOCK(mp);
+			deferred_unmount_enqueue(mp, flags, true);
+			return (0);
+		}
+		MNT_IUNLOCK(mp);
+		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers), ("mnt_uppers not empty"));
+	}
+
+	/* Allow the taskqueue to safely re-enqueue on failure */
+	if ((flags & MNT_DEFERRED) != 0)
+		vfs_ref(mp);
+
 	if ((coveredvp = mp->mnt_vnodecovered) != NULL) {
 		mnt_gen_r = mp->mnt_gen;
 		VI_LOCK(coveredvp);
@@ -1828,27 +2063,13 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 		}
 	}
 
-	/*
-	 * Only privileged root, or (if MNT_USER is set) the user that did the
-	 * original mount is permitted to unmount this filesystem.
-	 */
-	error = vfs_suser(mp, td);
-	if (error != 0) {
-		if (coveredvp != NULL) {
-			VOP_UNLOCK(coveredvp);
-			vdrop(coveredvp);
-		}
-		vfs_rel(mp);
-		return (error);
-	}
-
 	vfs_op_enter(mp);
 
 	vn_start_write(NULL, &mp, V_WAIT | V_MNTREF);
 	MNT_ILOCK(mp);
 	if ((mp->mnt_kern_flag & MNTK_UNMOUNT) != 0 ||
 	    (mp->mnt_flag & MNT_UPDATE) != 0 ||
-	    mp->mnt_pinned_count != 0) {
+	    !TAILQ_EMPTY(&mp->mnt_uppers)) {
 		dounmount_cleanup(mp, coveredvp, 0);
 		return (EBUSY);
 	}
@@ -1952,6 +2173,7 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 		}
 		return (error);
 	}
+
 	mtx_lock(&mountlist_mtx);
 	TAILQ_REMOVE(&mountlist, mp, mnt_list);
 	mtx_unlock(&mountlist_mtx);
@@ -1977,6 +2199,8 @@ dounmount(struct mount *mp, int flags, struct thread *td)
 	}
 	if (mp == rootdevmp)
 		rootdevmp = NULL;
+	if ((flags & MNT_DEFERRED) != 0)
+		vfs_rel(mp);
 	vfs_mount_destroy(mp);
 	return (0);
 }
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index a2f25bf78495..8add6951645f 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -831,9 +831,9 @@ vfs_busy(struct mount *mp, int flags)
 	 * valid.
 	 */
 	while (mp->mnt_kern_flag & MNTK_UNMOUNT) {
-		KASSERT(mp->mnt_pinned_count == 0,
-		    ("%s: non-zero pinned count %d with pending unmount",
-		    __func__, mp->mnt_pinned_count));
+		KASSERT(TAILQ_EMPTY(&mp->mnt_uppers),
+		    ("%s: non-empty upper mount list with pending unmount",
+		    __func__));
 		if (flags & MBF_NOWAIT || mp->mnt_kern_flag & MNTK_REFEXPIRE) {
 			MNT_REL(mp);
 			MNT_IUNLOCK(mp);
@@ -3897,6 +3897,11 @@ notify_lowervp_vfs_dummy(struct mount *mp __unused,
 {
 }
 
+struct notify_mount {
+	struct mount mp;
+	struct mount_upper_node upper;
+};
+
 /*
  * Notify upper mounts about reclaimed or unlinked vnode.
  */
@@ -3907,45 +3912,52 @@ vfs_notify_upper(struct vnode *vp, int event)
 		.vfs_reclaim_lowervp = notify_lowervp_vfs_dummy,
 		.vfs_unlink_lowervp = notify_lowervp_vfs_dummy,
 	};
-	struct mount *mp, *ump, *mmp;
+	struct mount *mp;
+	struct mount_upper_node *ump;
+	struct notify_mount *mmp;
 
 	mp = vp->v_mount;
 	if (mp == NULL)
 		return;
-	if (TAILQ_EMPTY(&mp->mnt_uppers))
+	if (TAILQ_EMPTY(&mp->mnt_notify))
 		return;
 
-	mmp = malloc(sizeof(struct mount), M_TEMP, M_WAITOK | M_ZERO);
-	mmp->mnt_op = &vgonel_vfsops;
-	mmp->mnt_kern_flag |= MNTK_MARKER;
+	mmp = malloc(sizeof(*mmp), M_TEMP, M_WAITOK | M_ZERO);
+	mmp->mp.mnt_op = &vgonel_vfsops;
+	mmp->mp.mnt_kern_flag |= MNTK_MARKER;
+	mmp->upper.mp = &mmp->mp;
 	MNT_ILOCK(mp);
-	mp->mnt_kern_flag |= MNTK_VGONE_UPPER;
-	for (ump = TAILQ_FIRST(&mp->mnt_uppers); ump != NULL;) {
-		if ((ump->mnt_kern_flag & MNTK_MARKER) != 0) {
+	mp->mnt_upper_pending++;
+	KASSERT(mp->mnt_upper_pending > 0,
+	    ("%s: mnt_upper_pending %d", __func__, mp->mnt_upper_pending));
+	for (ump = TAILQ_FIRST(&mp->mnt_notify); ump != NULL;) {
+		if ((ump->mp->mnt_kern_flag & MNTK_MARKER) != 0) {
 			ump = TAILQ_NEXT(ump, mnt_upper_link);
 			continue;
 		}
-		TAILQ_INSERT_AFTER(&mp->mnt_uppers, ump, mmp, mnt_upper_link);
+		TAILQ_INSERT_AFTER(&mp->mnt_notify, ump, &mmp->upper,
+		    mnt_upper_link);
 		MNT_IUNLOCK(mp);
 		switch (event) {
 		case VFS_NOTIFY_UPPER_RECLAIM:
-			VFS_RECLAIM_LOWERVP(ump, vp);
+			VFS_RECLAIM_LOWERVP(ump->mp, vp);
 			break;
 		case VFS_NOTIFY_UPPER_UNLINK:
-			VFS_UNLINK_LOWERVP(ump, vp);
+			VFS_UNLINK_LOWERVP(ump->mp, vp);
 			break;
 		default:
 			KASSERT(0, ("invalid event %d", event));
 			break;
 		}
 		MNT_ILOCK(mp);
-		ump = TAILQ_NEXT(mmp, mnt_upper_link);
-		TAILQ_REMOVE(&mp->mnt_uppers, mmp, mnt_upper_link);
+		ump = TAILQ_NEXT(&mmp->upper, mnt_upper_link);
+		TAILQ_REMOVE(&mp->mnt_notify, &mmp->upper, mnt_upper_link);
 	}
 	free(mmp, M_TEMP);
-	mp->mnt_kern_flag &= ~MNTK_VGONE_UPPER;
-	if ((mp->mnt_kern_flag & MNTK_VGONE_WAITER) != 0) {
-		mp->mnt_kern_flag &= ~MNTK_VGONE_WAITER;
+	mp->mnt_upper_pending--;
+	if ((mp->mnt_kern_flag & MNTK_UPPER_WAITER) != 0 &&
+	    mp->mnt_upper_pending == 0) {
+		mp->mnt_kern_flag &= ~MNTK_UPPER_WAITER;
 		wakeup(&mp->mnt_uppers);
 	}
 	MNT_IUNLOCK(mp);
@@ -4376,12 +4388,13 @@ DB_SHOW_COMMAND(mount, db_show_mount)
 	MNT_KERN_FLAG(MNTK_EXTENDED_SHARED);
 	MNT_KERN_FLAG(MNTK_SHARED_WRITES);
 	MNT_KERN_FLAG(MNTK_NO_IOPF);
-	MNT_KERN_FLAG(MNTK_VGONE_UPPER);
-	MNT_KERN_FLAG(MNTK_VGONE_WAITER);
+	MNT_KERN_FLAG(MNTK_RECURSE);
+	MNT_KERN_FLAG(MNTK_UPPER_WAITER);
 	MNT_KERN_FLAG(MNTK_LOOKUP_EXCL_DOTDOT);
 	MNT_KERN_FLAG(MNTK_MARKER);
 	MNT_KERN_FLAG(MNTK_USES_BCACHE);
 	MNT_KERN_FLAG(MNTK_FPLOOKUP);
+	MNT_KERN_FLAG(MNTK_TASKQUEUE_WAITER);
 	MNT_KERN_FLAG(MNTK_NOASYNC);
 	MNT_KERN_FLAG(MNTK_UNMOUNT);
 	MNT_KERN_FLAG(MNTK_MWAIT);
diff --git a/sys/sys/mount.h b/sys/sys/mount.h
index 693293b12370..2082ff089d69 100644
--- a/sys/sys/mount.h
+++ b/sys/sys/mount.h
@@ -190,6 +190,19 @@ struct mount_pcpu {
 _Static_assert(sizeof(struct mount_pcpu) == 16,
     "the struct is allocated from pcpu 16 zone");
 
+/*
+ * Structure for tracking a stacked filesystem mounted above another
+ * filesystem.  This is expected to be stored in the upper FS' per-mount data.
+ *
+ * Lock reference:
+ *	i - lower mount interlock
+ *	c - constant from node initialization
+ */
+struct mount_upper_node {
+	struct mount 	*mp;	/* (c) mount object for upper FS */
+	TAILQ_ENTRY(mount_upper_node) mnt_upper_link;	/* (i) position in uppers list */
+};
+
 /*
  * Structure per mounted filesystem.  Each mounted filesystem has an
  * array of operations and an instance record.  The filesystems are
@@ -199,8 +212,8 @@ _Static_assert(sizeof(struct mount_pcpu) == 16,
  * 	l - mnt_listmtx
  *	m - mountlist_mtx
  *	i - interlock
- *	i* - interlock of uppers' list head
  *	v - vnode freelist mutex
+ *	d - deferred unmount list mutex
  *
  * Unmarked fields are considered stable as long as a ref is held.
  *
@@ -242,10 +255,12 @@ struct mount {
 	struct mtx	mnt_listmtx;
 	struct vnodelst	mnt_lazyvnodelist;	/* (l) list of lazy vnodes */
 	int		mnt_lazyvnodelistsize;	/* (l) # of lazy vnodes */
-	int		mnt_pinned_count;	/* (i) unmount prevented */
+	int		mnt_upper_pending;	/* (i) # of pending ops on mnt_uppers */
 	struct lock	mnt_explock;		/* vfs_export walkers lock */
-	TAILQ_ENTRY(mount) mnt_upper_link;	/* (i*) we in the all uppers */
-	TAILQ_HEAD(, mount) mnt_uppers;		/* (i) upper mounts over us */
+	TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */
+	TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */
+	STAILQ_ENTRY(mount) mnt_taskqueue_link;	/* (d) our place in deferred unmount list */
+	uint64_t	mnt_taskqueue_flags;	/* (d) unmount flags passed from taskqueue */
 };
 #endif	/* _WANT_MOUNT || _KERNEL */
 
@@ -438,9 +453,13 @@ struct mntoptnames {
 #define	MNT_BYFSID	0x0000000008000000ULL /* specify filesystem by ID. */
 #define	MNT_NOCOVER	0x0000001000000000ULL /* Do not cover a mount point */
 #define	MNT_EMPTYDIR	0x0000002000000000ULL /* Only mount on empty dir */
-#define MNT_CMDFLAGS   (MNT_UPDATE	| MNT_DELEXPORT	| MNT_RELOAD	| \
+#define	MNT_RECURSE	0x0000100000000000ULL /* recursively unmount uppers */
+#define	MNT_DEFERRED    0x0000200000000000ULL /* unmount in async context */
+#define	MNT_CMDFLAGS   (MNT_UPDATE	| MNT_DELEXPORT	| MNT_RELOAD	| \
 			MNT_FORCE	| MNT_SNAPSHOT	| MNT_NONBUSY	| \
-			MNT_BYFSID	| MNT_NOCOVER	| MNT_EMPTYDIR)
+			MNT_BYFSID	| MNT_NOCOVER	| MNT_EMPTYDIR	| \
+			MNT_RECURSE	| MNT_DEFERRED)
+
 /*
  * Internal filesystem control flags stored in mnt_kern_flag.
  *
@@ -466,8 +485,8 @@ struct mntoptnames {
 #define	MNTK_NO_IOPF	0x00000100	/* Disallow page faults during reads
 					   and writes. Filesystem shall properly
 					   handle i/o state on EFAULT. */
-#define	MNTK_VGONE_UPPER	0x00000200
-#define	MNTK_VGONE_WAITER	0x00000400
+#define	MNTK_RECURSE		0x00000200 /* pending recursive unmount */
+#define	MNTK_UPPER_WAITER	0x00000400 /* waiting to drain MNTK_UPPER_PENDING */
 #define	MNTK_LOOKUP_EXCL_DOTDOT	0x00000800
 #define	MNTK_MARKER		0x00001000
 #define	MNTK_UNMAPPED_BUFS	0x00002000
@@ -477,8 +496,9 @@ struct mntoptnames {
 #define	MNTK_UNIONFS	0x00020000	/* A hack for F_ISUNIONSTACK */
 #define	MNTK_FPLOOKUP	0x00040000	/* fast path lookup is supported */
 #define	MNTK_SUSPEND_ALL	0x00080000 /* Suspended by all-fs suspension */
-#define MNTK_NOASYNC	0x00800000	/* disable async */
-#define MNTK_UNMOUNT	0x01000000	/* unmount in progress */
+#define	MNTK_TASKQUEUE_WAITER	0x00100000 /* Waiting on unmount taskqueue */
+#define	MNTK_NOASYNC	0x00800000	/* disable async */
+#define	MNTK_UNMOUNT	0x01000000	/* unmount in progress */
 #define	MNTK_MWAIT	0x02000000	/* waiting for unmount to finish */
 #define	MNTK_SUSPEND	0x08000000	/* request write suspension */
 #define	MNTK_SUSPEND2	0x04000000	/* block secondary writes */
@@ -952,7 +972,7 @@ vfs_statfs_t	__vfs_statfs;
  * exported vnode operations
  */
 
-int	dounmount(struct mount *, int, struct thread *);
+int	dounmount(struct mount *, uint64_t, struct thread *);
 
 int	kernel_mount(struct mntarg *ma, uint64_t flags);
 int	kernel_vmount(int flags, ...);
@@ -1012,8 +1032,13 @@ struct mount *vfs_mount_alloc(struct vnode *, struct vfsconf *, const char *,
 int	vfs_suser(struct mount *, struct thread *);
 void	vfs_unbusy(struct mount *);
 void	vfs_unmountall(void);
-struct mount *vfs_pin_from_vp(struct vnode *);
-void	vfs_unpin(struct mount *);
+struct mount *vfs_register_upper_from_vp(struct vnode *,
+	    struct mount *ump, struct mount_upper_node *);
+void	vfs_register_for_notification(struct mount *, struct mount *,
+	    struct mount_upper_node *);
+void	vfs_unregister_for_notification(struct mount *,
+	    struct mount_upper_node *);
+void	vfs_unregister_upper(struct mount *, struct mount_upper_node *);
 extern	TAILQ_HEAD(mntlist, mount) mountlist;	/* mounted filesystem list */
 extern	struct mtx_padalign mountlist_mtx;
 extern	struct nfs_public nfs_pub;
diff --git a/sys/ufs/ffs/ffs_vfsops.c b/sys/ufs/ffs/ffs_vfsops.c
index 6b7407eb88f9..689c85d7bb1f 100644
--- a/sys/ufs/ffs/ffs_vfsops.c
+++ b/sys/ufs/ffs/ffs_vfsops.c
@@ -297,7 +297,7 @@ ffs_fsfail_unmount(void *v, int pending)
 	 */
 	mp = vfs_getvfs(&etp->fsid);
 	if (mp != NULL)
-		dounmount(mp, MNT_FORCE, curthread);
+		dounmount(mp, MNT_FORCE | MNT_RECURSE, curthread);
 	free(etp, M_UFSMNT);
 }
 


More information about the dev-commits-src-main mailing list