git: aeabf8d4b972 - main - nullfs: hash insertion without vnode lock upgrade

From: Mateusz Guzik <mjg@FreeBSD.org>
Date: Sat, 19 Mar 2022 10:50:34 UTC
The branch main has been updated by mjg:

URL: https://cgit.FreeBSD.org/src/commit/?id=aeabf8d4b9727201648887f8aa7dd1930cf0e154

commit aeabf8d4b9727201648887f8aa7dd1930cf0e154
Author:     Mateusz Guzik <mjg@FreeBSD.org>
AuthorDate: 2022-03-07 11:43:43 +0000
Commit:     Mateusz Guzik <mjg@FreeBSD.org>
CommitDate: 2022-03-19 10:47:10 +0000

    nullfs: hash insertion without vnode lock upgrade
    
    Use the hash lock to serialize instead.
    
    This enables shared-locked ".." lookups.
    
    Reviewed by:    markj
    Tested by:      pho (previous version)
    Differential Revision:  https://reviews.freebsd.org/D34466
---
 sys/fs/nullfs/null_subr.c   | 114 ++++++++++++++++++++------------------------
 sys/fs/nullfs/null_vfsops.c |   2 +-
 2 files changed, 53 insertions(+), 63 deletions(-)

diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 6b422410b9ec..b2b6e6837b2d 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -65,7 +65,7 @@ static u_long null_hash_mask;
 static MALLOC_DEFINE(M_NULLFSHASH, "nullfs_hash", "NULLFS hash table");
 MALLOC_DEFINE(M_NULLFSNODE, "nullfs_node", "NULLFS vnode private part");
 
-static struct vnode * null_hashins(struct mount *, struct null_node *);
+static void null_hashins(struct mount *, struct null_node *);
 
 /*
  * Initialise cache headers
@@ -93,14 +93,15 @@ nullfs_uninit(struct vfsconf *vfsp)
  * Return a VREF'ed alias for lower vnode if already exists, else 0.
  * Lower vnode should be locked on entry and will be left locked on exit.
  */
-struct vnode *
-null_hashget(struct mount *mp, struct vnode *lowervp)
+static struct vnode *
+null_hashget_locked(struct mount *mp, struct vnode *lowervp)
 {
 	struct null_node_hashhead *hd;
 	struct null_node *a;
 	struct vnode *vp;
 
 	ASSERT_VOP_LOCKED(lowervp, "null_hashget");
+	rw_assert(&null_hash_lock, RA_LOCKED);
 
 	/*
 	 * Find hash base, and then search the (two-way) linked
@@ -109,9 +110,6 @@ null_hashget(struct mount *mp, struct vnode *lowervp)
 	 * reference count (but NOT the lower vnode's VREF counter).
 	 */
 	hd = NULL_NHASH(lowervp);
-	if (LIST_EMPTY(hd))
-		return (NULLVP);
-	rw_rlock(&null_hash_lock);
 	LIST_FOREACH(a, hd, null_hash) {
 		if (a->null_lowervp == lowervp && NULLTOV(a)->v_mount == mp) {
 			/*
@@ -122,43 +120,50 @@ null_hashget(struct mount *mp, struct vnode *lowervp)
 			 */
 			vp = NULLTOV(a);
 			vref(vp);
-			rw_runlock(&null_hash_lock);
 			return (vp);
 		}
 	}
-	rw_runlock(&null_hash_lock);
 	return (NULLVP);
 }
 
-/*
- * Act like null_hashget, but add passed null_node to hash if no existing
- * node found.
- */
-static struct vnode *
+struct vnode *
+null_hashget(struct mount *mp, struct vnode *lowervp)
+{
+	struct null_node_hashhead *hd;
+	struct vnode *vp;
+
+	hd = NULL_NHASH(lowervp);
+	if (LIST_EMPTY(hd))
+		return (NULLVP);
+
+	rw_rlock(&null_hash_lock);
+	vp = null_hashget_locked(mp, lowervp);
+	rw_runlock(&null_hash_lock);
+
+	return (vp);
+}
+
+static void
 null_hashins(struct mount *mp, struct null_node *xp)
 {
 	struct null_node_hashhead *hd;
+#ifdef INVARIANTS
 	struct null_node *oxp;
-	struct vnode *ovp;
+#endif
+
+	rw_assert(&null_hash_lock, RA_WLOCKED);
 
 	hd = NULL_NHASH(xp->null_lowervp);
-	rw_wlock(&null_hash_lock);
+#ifdef INVARIANTS
 	LIST_FOREACH(oxp, hd, null_hash) {
 		if (oxp->null_lowervp == xp->null_lowervp &&
 		    NULLTOV(oxp)->v_mount == mp) {
-			/*
-			 * See null_hashget for a description of this
-			 * operation.
-			 */
-			ovp = NULLTOV(oxp);
-			vref(ovp);
-			rw_wunlock(&null_hash_lock);
-			return (ovp);
+			VNASSERT(0, NULLTOV(oxp),
+			    ("vnode already in hash"));
 		}
 	}
+#endif
 	LIST_INSERT_HEAD(hd, xp, null_hash);
-	rw_wunlock(&null_hash_lock);
-	return (NULLVP);
 }
 
 static void
@@ -201,19 +206,6 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
 		return (0);
 	}
 
-	/*
-	 * The insmntque1() call below requires the exclusive lock on
-	 * the nullfs vnode.  Upgrade the lock now if hash failed to
-	 * provide ready to use vnode.
-	 */
-	if (VOP_ISLOCKED(lowervp) != LK_EXCLUSIVE) {
-		vn_lock(lowervp, LK_UPGRADE | LK_RETRY);
-		if (VN_IS_DOOMED(lowervp)) {
-			vput(lowervp);
-			return (ENOENT);
-		}
-	}
-
 	/*
 	 * We do not serialize vnode creation, instead we will check for
 	 * duplicates later, when adding new vnode to hash.
@@ -229,20 +221,23 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
 		return (error);
 	}
 
+	VNPASS(vp->v_object == NULL, vp);
+	VNPASS((vn_irflag_read(vp) & VIRF_PGREAD) == 0, vp);
+
+	rw_wlock(&null_hash_lock);
 	xp->null_vnode = vp;
 	xp->null_lowervp = lowervp;
 	xp->null_flags = 0;
 	vp->v_type = lowervp->v_type;
 	vp->v_data = xp;
 	vp->v_vnlock = lowervp->v_vnlock;
-	error = insmntque1(vp, mp);
-	if (error != 0) {
-		vput(lowervp);
+	*vpp = null_hashget_locked(mp, lowervp);
+	if (*vpp != NULL) {
+		rw_wunlock(&null_hash_lock);
+		vrele(lowervp);
 		null_destroy_proto(vp, xp);
-		return (error);
+		return (0);
 	}
-	if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
-		vp->v_vflag |= VV_ROOT;
 
 	/*
 	 * We might miss the case where lower vnode sets VIRF_PGREAD
@@ -251,28 +246,23 @@ null_nodeget(struct mount *mp, struct vnode *lowervp, struct vnode **vpp)
 	 */
 	if ((vn_irflag_read(lowervp) & VIRF_PGREAD) != 0) {
 		MPASS(lowervp->v_object != NULL);
-		if ((vn_irflag_read(vp) & VIRF_PGREAD) == 0) {
-			if (vp->v_object == NULL)
-				vp->v_object = lowervp->v_object;
-			else
-				MPASS(vp->v_object == lowervp->v_object);
-			vn_irflag_set_cond(vp, VIRF_PGREAD);
-		} else {
-			MPASS(vp->v_object != NULL);
-		}
+		vp->v_object = lowervp->v_object;
+		vn_irflag_set(vp, VIRF_PGREAD);
 	}
+	if (lowervp == MOUNTTONULLMOUNT(mp)->nullm_lowerrootvp)
+		vp->v_vflag |= VV_ROOT;
 
-	/*
-	 * Atomically insert our new node into the hash or vget existing 
-	 * if someone else has beaten us to it.
-	 */
-	*vpp = null_hashins(mp, xp);
-	if (*vpp != NULL) {
-		vrele(lowervp);
-		vp->v_object = NULL;	/* in case VIRF_PGREAD set it */
+	error = insmntque1(vp, mp);
+	if (error != 0) {
+		rw_wunlock(&null_hash_lock);
+		vput(lowervp);
+		vp->v_object = NULL;
 		null_destroy_proto(vp, xp);
-		return (0);
+		return (error);
 	}
+
+	null_hashins(mp, xp);
+	rw_wunlock(&null_hash_lock);
 	*vpp = vp;
 
 	return (0);
diff --git a/sys/fs/nullfs/null_vfsops.c b/sys/fs/nullfs/null_vfsops.c
index 1dba5f51d619..21a28dd45570 100644
--- a/sys/fs/nullfs/null_vfsops.c
+++ b/sys/fs/nullfs/null_vfsops.c
@@ -207,7 +207,7 @@ nullfs_mount(struct mount *mp)
 		    (MNTK_SHARED_WRITES | MNTK_LOOKUP_SHARED |
 		    MNTK_EXTENDED_SHARED);
 	}
-	mp->mnt_kern_flag |= MNTK_LOOKUP_EXCL_DOTDOT | MNTK_NOMSYNC;
+	mp->mnt_kern_flag |= MNTK_NOMSYNC | MNTK_UNLOCKED_INSMNTQUE;
 	mp->mnt_kern_flag |= lowerrootvp->v_mount->mnt_kern_flag &
 	    (MNTK_USES_BCACHE | MNTK_NO_IOPF | MNTK_UNMAPPED_BUFS);
 	MNT_IUNLOCK(mp);