VOP_VPTOCNP() interface change

Kostik Belousov kostikbel at gmail.com
Wed Nov 16 10:25:31 UTC 2011


Apparently, the VOP_VPTOCNP() interface has a fatal flaw that only
matter for nullfs vnodes.  The problem is that resulting vnode is only
required to be held on return from the successfull call to vop,
instead of being referenced. I designed the interface this way because
dropping the reference might need to take the vnode lock, which is
highly inconvenient in the main loop of vn_fullpath(), which holds the
namecache rwlock for read.

Nullfs VOP_INACTIVE() method reclaims the vnode, which in combination
with the VOP_VPTOCNP() interface means that the directory vnode
returned from VOP_VPTOCNP() is reclaimed in advance, causing
vn_fullpath() to error with EBADF or like.

It appeared that my worries about dropping the cache lock were
unfounded. Below is the patch that changes the requirements for
VOP_VPTOCNP(), now the dvp must be referenced. I converted all in-tree
implementations of VOP_VPTOCNP(), it appeared to be trivial, which is
expected, because vhold(9) and vref(9) are similar in the locking. So
I do not expect much troubles for out-of-tree fs implementation of
VOP_VPTOCNP, if any.

Below is the patch, it was tested by Peter Holm.  Among other issues,
I believe that it should fix some JVM errors when JVM is run from the
nullfs-mounted fs.

Please comment, I will commit the patch in a week.

diff --git a/share/man/man9/VOP_VPTOCNP.9 b/share/man/man9/VOP_VPTOCNP.9
index 6bcbd25..671c8df 100644
--- a/share/man/man9/VOP_VPTOCNP.9
+++ b/share/man/man9/VOP_VPTOCNP.9
@@ -65,9 +65,9 @@ is not a directory, then
 .Nm
 returns ENOENT.
 .Sh LOCKS
-The vnode should be locked on entry and will still be locked on exit.  The
-parent directory vnode will be unlocked on a successful exit.  However, it
-will have its hold count incremented.
+The vnode should be locked on entry and will still be locked on exit.
+The parent directory vnode will be unlocked on a successful exit.
+However, it will have its use count incremented.
 .Sh RETURN VALUES
 Zero is returned on success, otherwise an error code is returned.
 .Sh ERRORS
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index a880961..65fc902 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -1594,7 +1594,7 @@ zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap)
 		*ap->a_buflen -= len;
 		bcopy(sep->se_name, ap->a_buf + *ap->a_buflen, len);
 		mutex_exit(&sdp->sd_lock);
-		vhold(dvp);
+		vref(dvp);
 		*ap->a_vpp = dvp;
 	}
 	VN_RELE(dvp);
diff --git a/sys/fs/devfs/devfs_vnops.c b/sys/fs/devfs/devfs_vnops.c
index eb154b1..22908b9 100644
--- a/sys/fs/devfs/devfs_vnops.c
+++ b/sys/fs/devfs/devfs_vnops.c
@@ -261,7 +261,7 @@ devfs_vptocnp(struct vop_vptocnp_args *ap)
 	} else if (vp->v_type == VDIR) {
 		if (dd == dmp->dm_rootdir) {
 			*dvp = vp;
-			vhold(*dvp);
+			vref(*dvp);
 			goto finished;
 		}
 		i -= dd->de_dirent->d_namlen;
@@ -289,6 +289,8 @@ devfs_vptocnp(struct vop_vptocnp_args *ap)
 		mtx_unlock(&devfs_de_interlock);
 		vholdl(*dvp);
 		VI_UNLOCK(*dvp);
+		vref(*dvp);
+		vdrop(*dvp);
 	} else {
 		mtx_unlock(&devfs_de_interlock);
 		error = ENOENT;
diff --git a/sys/fs/nullfs/null_subr.c b/sys/fs/nullfs/null_subr.c
index 5717a01..a45ccf7 100644
--- a/sys/fs/nullfs/null_subr.c
+++ b/sys/fs/nullfs/null_subr.c
@@ -289,22 +289,12 @@ null_checkvp(vp, fil, lno)
 #endif
 	if (a->null_lowervp == NULLVP) {
 		/* Should never happen */
-		int i; u_long *p;
-		printf("vp = %p, ZERO ptr\n", (void *)vp);
-		for (p = (u_long *) a, i = 0; i < 8; i++)
-			printf(" %lx", p[i]);
-		printf("\n");
-		panic("null_checkvp");
+		panic("null_checkvp %p", vp);
 	}
 	VI_LOCK_FLAGS(a->null_lowervp, MTX_DUPOK);
-	if (a->null_lowervp->v_usecount < 1) {
-		int i; u_long *p;
-		printf("vp = %p, unref'ed lowervp\n", (void *)vp);
-		for (p = (u_long *) a, i = 0; i < 8; i++)
-			printf(" %lx", p[i]);
-		printf("\n");
-		panic ("null with unref'ed lowervp");
-	}
+	if (a->null_lowervp->v_usecount < 1)
+		panic ("null with unref'ed lowervp, vp %p lvp %p",
+		    vp, a->null_lowervp);
 	VI_UNLOCK(a->null_lowervp);
 #ifdef notyet
 	printf("null %x/%d -> %x/%d [%s, %d]\n",
diff --git a/sys/fs/nullfs/null_vnops.c b/sys/fs/nullfs/null_vnops.c
index 30a38da..bcf8750 100644
--- a/sys/fs/nullfs/null_vnops.c
+++ b/sys/fs/nullfs/null_vnops.c
@@ -729,7 +729,7 @@ null_print(struct vop_print_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 
-	printf("\tvp=%p, lowervp=%p\n", vp, NULLVPTOLOWERVP(vp));
+	printf("\tvp=%p, lowervp=%p\n", vp, VTONULL(vp)->null_lowervp);
 	return (0);
 }
 
@@ -784,6 +784,7 @@ null_vptocnp(struct vop_vptocnp_args *ap)
 	vhold(lvp);
 	VOP_UNLOCK(vp, 0); /* vp is held by vn_vptocnp_locked that called us */
 	ldvp = lvp;
+	vref(lvp);
 	error = vn_vptocnp(&ldvp, cred, ap->a_buf, ap->a_buflen);
 	vdrop(lvp);
 	if (error != 0) {
@@ -797,19 +798,17 @@ null_vptocnp(struct vop_vptocnp_args *ap)
 	 */
 	error = vn_lock(ldvp, LK_EXCLUSIVE);
 	if (error != 0) {
+		vrele(ldvp);
 		vn_lock(vp, locked | LK_RETRY);
-		vdrop(ldvp);
 		return (ENOENT);
 	}
 	vref(ldvp);
-	vdrop(ldvp);
 	error = null_nodeget(vp->v_mount, ldvp, dvp);
 	if (error == 0) {
 #ifdef DIAGNOSTIC
 		NULLVPTOLOWERVP(*dvp);
 #endif
-		vhold(*dvp);
-		vput(*dvp);
+		VOP_UNLOCK(*dvp, 0); /* keep reference on *dvp */
 	} else
 		vput(ldvp);
 
diff --git a/sys/fs/pseudofs/pseudofs_vnops.c b/sys/fs/pseudofs/pseudofs_vnops.c
index 27b2605..a89174c 100644
--- a/sys/fs/pseudofs/pseudofs_vnops.c
+++ b/sys/fs/pseudofs/pseudofs_vnops.c
@@ -410,8 +410,7 @@ pfs_vptocnp(struct vop_vptocnp_args *ap)
 	}
 
 	*buflen = i;
-	vhold(*dvp);
-	vput(*dvp);
+	VOP_UNLOCK(*dvp, 0);
 	vn_lock(vp, locked | LK_RETRY);
 	vfs_unbusy(mp);
 
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index 11efcab..177fd56 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -1068,16 +1068,8 @@ vn_vptocnp(struct vnode **vp, struct ucred *cred, char *buf, u_int *buflen)
 
 	CACHE_RLOCK();
 	error = vn_vptocnp_locked(vp, cred, buf, buflen);
-	if (error == 0) {
-		/*
-		 * vn_vptocnp_locked() dropped hold acquired by
-		 * VOP_VPTOCNP immediately after locking the
-		 * cache. Since we are going to drop the cache rlock,
-		 * re-hold the result.
-		 */
-		vhold(*vp);
+	if (error == 0)
 		CACHE_RUNLOCK();
-	}
 	return (error);
 }
 
@@ -1096,6 +1088,9 @@ vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
 	if (ncp != NULL) {
 		if (*buflen < ncp->nc_nlen) {
 			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
+			vrele(*vp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail4++;
 			error = ENOMEM;
 			SDT_PROBE(vfs, namecache, fullpath, return, error,
@@ -1106,18 +1101,23 @@ vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
 		memcpy(buf + *buflen, ncp->nc_name, ncp->nc_nlen);
 		SDT_PROBE(vfs, namecache, fullpath, hit, ncp->nc_dvp,
 		    ncp->nc_name, vp, 0, 0);
+		dvp = *vp;
 		*vp = ncp->nc_dvp;
+		vref(*vp);
+		CACHE_RUNLOCK();
+		vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
+		vrele(dvp);
+		VFS_UNLOCK_GIANT(vfslocked);
+		CACHE_RLOCK();
 		return (0);
 	}
 	SDT_PROBE(vfs, namecache, fullpath, miss, vp, 0, 0, 0, 0);
 
-	vhold(*vp);
 	CACHE_RUNLOCK();
 	vfslocked = VFS_LOCK_GIANT((*vp)->v_mount);
 	vn_lock(*vp, LK_SHARED | LK_RETRY);
 	error = VOP_VPTOCNP(*vp, &dvp, cred, buf, buflen);
-	VOP_UNLOCK(*vp, 0);
-	vdrop(*vp);
+	vput(*vp);
 	VFS_UNLOCK_GIANT(vfslocked);
 	if (error) {
 		numfullpathfail2++;
@@ -1128,16 +1128,20 @@ vn_vptocnp_locked(struct vnode **vp, struct ucred *cred, char *buf,
 
 	*vp = dvp;
 	CACHE_RLOCK();
-	if ((*vp)->v_iflag & VI_DOOMED) {
+	if (dvp->v_iflag & VI_DOOMED) {
 		/* forced unmount */
 		CACHE_RUNLOCK();
-		vdrop(*vp);
+		vfslocked = VFS_LOCK_GIANT(dvp->v_mount);
+		vrele(dvp);
+		VFS_UNLOCK_GIANT(vfslocked);
 		error = ENOENT;
 		SDT_PROBE(vfs, namecache, fullpath, return, error, vp,
 		    NULL, 0, 0);
 		return (error);
 	}
-	vdrop(*vp);
+	/*
+	 * *vp has its use count incremented still.
+	 */
 
 	return (0);
 }
@@ -1149,10 +1153,11 @@ static int
 vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
     char *buf, char **retbuf, u_int buflen)
 {
-	int error, slash_prefixed;
+	int error, slash_prefixed, vfslocked;
 #ifdef KDTRACE_HOOKS
 	struct vnode *startvp = vp;
 #endif
+	struct vnode *vp1;
 
 	buflen--;
 	buf[buflen] = '\0';
@@ -1161,6 +1166,7 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 
 	SDT_PROBE(vfs, namecache, fullpath, entry, vp, 0, 0, 0, 0);
 	numfullpathcalls++;
+	vref(vp);
 	CACHE_RLOCK();
 	if (vp->v_type != VDIR) {
 		error = vn_vptocnp_locked(&vp, td->td_ucred, buf, &buflen);
@@ -1168,6 +1174,9 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 			return (error);
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			return (ENOMEM);
 		}
 		buf[--buflen] = '/';
@@ -1177,16 +1186,29 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 		if (vp->v_vflag & VV_ROOT) {
 			if (vp->v_iflag & VI_DOOMED) {	/* forced unmount */
 				CACHE_RUNLOCK();
+				vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+				vrele(vp);
+				VFS_UNLOCK_GIANT(vfslocked);
 				error = ENOENT;
 				SDT_PROBE(vfs, namecache, fullpath, return,
 				    error, vp, NULL, 0, 0);
 				break;
 			}
-			vp = vp->v_mount->mnt_vnodecovered;
+			vp1 = vp->v_mount->mnt_vnodecovered;
+			vref(vp1);
+			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
+			vp = vp1;
+			CACHE_RLOCK();
 			continue;
 		}
 		if (vp->v_type != VDIR) {
 			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail1++;
 			error = ENOTDIR;
 			SDT_PROBE(vfs, namecache, fullpath, return,
@@ -1198,6 +1220,9 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 			break;
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			error = ENOMEM;
 			SDT_PROBE(vfs, namecache, fullpath, return, error,
 			    startvp, NULL, 0, 0);
@@ -1211,6 +1236,9 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 	if (!slash_prefixed) {
 		if (buflen == 0) {
 			CACHE_RUNLOCK();
+			vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+			vrele(vp);
+			VFS_UNLOCK_GIANT(vfslocked);
 			numfullpathfail4++;
 			SDT_PROBE(vfs, namecache, fullpath, return, ENOMEM,
 			    startvp, NULL, 0, 0);
@@ -1220,6 +1248,9 @@ vn_fullpath1(struct thread *td, struct vnode *vp, struct vnode *rdir,
 	}
 	numfullpathfound++;
 	CACHE_RUNLOCK();
+	vfslocked = VFS_LOCK_GIANT(vp->v_mount);
+	vrele(vp);
+	VFS_UNLOCK_GIANT(vfslocked);
 
 	SDT_PROBE(vfs, namecache, fullpath, return, 0, startvp, buf + buflen,
 	    0, 0);
diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c
index e9f8151..e47498e 100644
--- a/sys/kern/vfs_default.c
+++ b/sys/kern/vfs_default.c
@@ -844,7 +844,7 @@ out:
 	free(dirbuf, M_TEMP);
 	if (!error) {
 		*buflen = i;
-		vhold(*dvp);
+		vref(*dvp);
 	}
 	if (covered) {
 		vput(*dvp);
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 196 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-fs/attachments/20111116/f26f1921/attachment.pgp


More information about the freebsd-fs mailing list