quota deadlock on 6.1-RC1

Dmitry Morozovsky marck at rinet.ru
Sun May 7 16:29:12 UTC 2006


On Sun, 7 May 2006, Konstantin Belousov wrote:

KB> > I'm running RELENG_6 with hand-merged sys/ufs/ffs changes from 1 to 7 May.

KB> It would be great to show the patchset. Also note that relevant changes are
KB> not limited to ufs/ffs. For instance, rev. 1.671 of kern/vfs_subr.c is
KB> also important, as well as rev. 1.82 of ufs/ufs/ufs_quota.c, that
KB> deals explicitely with combination of snapshots and quotas.

Actually I did (see attachment) ;-)

Sincerely,
D.Marck                                     [DM5020, MCK-RIPE, DM3-RIPN]
------------------------------------------------------------------------
*** Dmitry Morozovsky --- D.Marck --- Wild Woozle --- marck at rinet.ru ***
------------------------------------------------------------------------
-------------- next part --------------
Index: kern/vfs_subr.c
===================================================================
RCS file: /home/ncvs/src/sys/kern/vfs_subr.c,v
retrieving revision 1.635.2.16.2.1
diff -u -r1.635.2.16.2.1 vfs_subr.c
--- kern/vfs_subr.c	4 May 2006 07:42:10 -0000	1.635.2.16.2.1
+++ kern/vfs_subr.c	7 May 2006 16:27:52 -0000
@@ -2839,7 +2839,6 @@
 	struct vnode *vp, *mvp;
 	struct vm_object *obj;
 
-	(void) vn_start_write(NULL, &mp, V_WAIT);
 	MNT_ILOCK(mp);
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
@@ -2870,7 +2869,6 @@
 			VI_UNLOCK(vp);
 	}
 	MNT_IUNLOCK(mp);
-	vn_finished_write(mp);
 }
 
 /*
Index: ufs/ffs/ffs_rawread.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_rawread.c,v
retrieving revision 1.25.2.2
diff -u -r1.25.2.2 ffs_rawread.c
--- ufs/ffs/ffs_rawread.c	9 Mar 2006 00:18:45 -0000	1.25.2.2
+++ ufs/ffs/ffs_rawread.c	7 May 2006 16:27:56 -0000
@@ -129,8 +129,16 @@
 			upgraded = 0;
 			
 		
-		/* Attempt to msync mmap() regions to clean dirty mmap */ 
 		VI_LOCK(vp);
+		/* Check if vnode was reclaimed while unlocked. */
+		if ((vp->v_iflag & VI_DOOMED) != 0) {
+			VI_UNLOCK(vp);
+			if (upgraded != 0)
+				VOP_LOCK(vp, LK_DOWNGRADE, td);
+			vn_finished_write(mp);
+			return (EIO);
+		}
+		/* Attempt to msync mmap() regions to clean dirty mmap */ 
 		if ((vp->v_iflag & VI_OBJDIRTY) != 0) {
 			VI_UNLOCK(vp);
 			if (vp->v_object != NULL) {
@@ -150,6 +158,7 @@
 			VI_UNLOCK(vp);
 			if (upgraded != 0)
 				VOP_LOCK(vp, LK_DOWNGRADE, td);
+			vn_finished_write(mp);
 			return (error);
 		}
 		/* Flush dirty buffers */
@@ -159,6 +168,7 @@
 			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
 				if (upgraded != 0)
 					VOP_LOCK(vp, LK_DOWNGRADE, td);
+				vn_finished_write(mp);
 				return (error);
 			}
 			VI_LOCK(vp);
Index: ufs/ffs/ffs_snapshot.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v
retrieving revision 1.103.2.5
diff -u -r1.103.2.5 ffs_snapshot.c
--- ufs/ffs/ffs_snapshot.c	22 Mar 2006 17:42:31 -0000	1.103.2.5
+++ ufs/ffs/ffs_snapshot.c	7 May 2006 16:27:56 -0000
@@ -36,6 +36,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD: src/sys/ufs/ffs/ffs_snapshot.c,v 1.103.2.5 2006/03/22 17:42:31 tegge Exp $");
 
+#include "opt_quota.h"
+
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
@@ -104,6 +106,7 @@
     struct fs *, ufs_lbn_t, int);
 static int readblock(struct vnode *vp, struct buf *, ufs2_daddr_t);
 static void process_deferred_inactive(struct mount *);
+static void try_free_snapdata(struct vnode *devvp, struct thread *td);
 
 /*
  * To ensure the consistency of snapshots across crashes, we must
@@ -306,6 +309,18 @@
 		if (error)
 			goto out;
 	}
+#ifdef QUOTA
+	/*
+	 * Turn off disk quotas for snapshot file.
+	 */
+	(void) chkdq(ip, -DIP(ip, i_blocks), KERNCRED, FORCE);
+	for (i = 0; i < MAXQUOTAS; i++) {
+		if (ip->i_dquot[i] != NODQUOT) {
+			dqrele(vp, ip->i_dquot[i]);
+			ip->i_dquot[i] = NODQUOT;
+		}
+	}
+#endif
 	/*
 	 * Change inode to snapshot type file.
 	 */
@@ -348,6 +363,11 @@
 		vn_start_write(NULL, &wrtmp, V_WAIT);
 	}
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (ip->i_effnlink == 0) {
+		error = ENOENT;		/* Snapshot file unlinked */
+		sn = NULL;
+		goto out1;
+	}
 	if (collectsnapstats)
 		nanotime(&starttime);
 	/*
@@ -551,7 +571,6 @@
 	}
 	lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 	    VI_MTX(vp), td);
-	transferlockers(&vp->v_lock, vp->v_vnlock);
 	lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 	/*
 	 * If this is the first snapshot on this filesystem, then we need
@@ -707,12 +726,17 @@
 	 * the inode for this snapshot then a deadlock can occur. Drop
 	 * the snapshot lock until the buffer has been written.
 	 */
+	VREF(vp);	/* Protect against ffs_snapgone() */
 	VOP_UNLOCK(vp, 0, td);
 	(void) bread(ip->i_devvp,
 		     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
 		     (int) fs->fs_bsize, NOCRED, &nbp);
 	brelse(nbp);
 	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, td);
+	if (ip->i_effnlink == 0)
+		error = ENOENT;		/* Snapshot file unlinked */
+	else
+		vrele(vp);		/* Drop extra reference */
 done:
 	FREE(copy_fs->fs_csp, M_UFSMNT);
 	bawrite(sbp);
@@ -1469,18 +1493,16 @@
 {
 	struct inode *ip;
 	struct vnode *devvp;
-	struct lock *lkp;
 	struct buf *ibp;
 	struct fs *fs;
 	struct thread *td = curthread;
-	ufs2_daddr_t numblks, blkno, dblk, *snapblklist;
+	ufs2_daddr_t numblks, blkno, dblk;
 	int error, loc, last;
 	struct snapdata *sn;
 
 	ip = VTOI(vp);
 	fs = ip->i_fs;
 	devvp = ip->i_devvp;
-	sn = devvp->v_rdev->si_snapdata;
 	/*
 	 * If active, delete from incore list (this snapshot may
 	 * already have been in the process of being deleted, so
@@ -1488,29 +1510,23 @@
 	 *
 	 * Clear copy-on-write flag if last snapshot.
 	 */
+	VI_LOCK(devvp);
 	if (ip->i_nextsnap.tqe_prev != 0) {
-		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
-		VI_LOCK(devvp);
+		sn = devvp->v_rdev->si_snapdata;
 		TAILQ_REMOVE(&sn->sn_head, ip, i_nextsnap);
 		ip->i_nextsnap.tqe_prev = 0;
-		lkp = vp->v_vnlock;
+		VI_UNLOCK(devvp);
+		lockmgr(&vp->v_lock, LK_EXCLUSIVE, NULL, td);
+		VI_LOCK(vp);
+		KASSERT(vp->v_vnlock == &sn->sn_lock,
+			("ffs_snapremove: lost lock mutation")); 
 		vp->v_vnlock = &vp->v_lock;
-		lockmgr(lkp, LK_RELEASE, NULL, td);
-		if (TAILQ_FIRST(&sn->sn_head) != 0) {
-			VI_UNLOCK(devvp);
-		} else {
-			snapblklist = sn->sn_blklist;
-			sn->sn_blklist = 0;
-			sn->sn_listsize = 0;
-			devvp->v_rdev->si_snapdata = NULL;
-			devvp->v_vflag &= ~VV_COPYONWRITE;
-			lockmgr(lkp, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
-			lockmgr(lkp, LK_RELEASE, NULL, td);
-			lockdestroy(lkp);
-			free(sn, M_UFSMNT);
-			FREE(snapblklist, M_UFSMNT);
-		}
-	}
+		VI_UNLOCK(vp);
+		VI_LOCK(devvp);
+		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+		try_free_snapdata(devvp, td);
+	} else
+		VI_UNLOCK(devvp);
 	/*
 	 * Clear all BLK_NOCOPY fields. Pass any block claims to other
 	 * snapshots that want them (see ffs_snapblkfree below).
@@ -1575,6 +1591,13 @@
 	ip->i_flags &= ~SF_SNAPSHOT;
 	DIP_SET(ip, i_flags, ip->i_flags);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
+#ifdef QUOTA
+	/*
+	 * Reenable disk quotas for ex-snapshot file.
+	 */
+	if (!getinoquota(ip))
+		(void) chkdq(ip, DIP(ip, i_blocks), KERNCRED, FORCE);
+#endif
 }
 
 /*
@@ -1866,7 +1889,6 @@
 		}
 		lockmgr(vp->v_vnlock, LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY,
 		    VI_MTX(vp), td);
-		transferlockers(&vp->v_lock, vp->v_vnlock);
 		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
 		/*
 		 * Link it onto the active snapshot list.
@@ -1939,30 +1961,35 @@
 	struct snapdata *sn;
 	struct inode *xp;
 	struct vnode *vp;
+	struct thread *td = curthread;
 
-	sn = devvp->v_rdev->si_snapdata;
 	VI_LOCK(devvp);
-	while ((xp = TAILQ_FIRST(&sn->sn_head)) != 0) {
+	sn = devvp->v_rdev->si_snapdata;
+	while (sn != NULL && (xp = TAILQ_FIRST(&sn->sn_head)) != NULL) {
 		vp = ITOV(xp);
-		vp->v_vnlock = &vp->v_lock;
 		TAILQ_REMOVE(&sn->sn_head, xp, i_nextsnap);
 		xp->i_nextsnap.tqe_prev = 0;
-		if (xp->i_effnlink > 0) {
-			VI_UNLOCK(devvp);
+		lockmgr(&sn->sn_lock, 
+			LK_INTERLOCK | LK_EXCLUSIVE,
+			VI_MTX(devvp),
+			td);
+		VI_LOCK(vp);
+		lockmgr(&vp->v_lock,
+			LK_INTERLOCK | LK_EXCLUSIVE,
+			VI_MTX(vp), td);
+		VI_LOCK(vp);
+		KASSERT(vp->v_vnlock == &sn->sn_lock,
+		("ffs_snapshot_unmount: lost lock mutation")); 
+		vp->v_vnlock = &vp->v_lock;
+		VI_UNLOCK(vp);
+		lockmgr(&vp->v_lock, LK_RELEASE, NULL, td);
+		lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+		if (xp->i_effnlink > 0)
 			vrele(vp);
-			VI_LOCK(devvp);
-		}
-	}
-	devvp->v_rdev->si_snapdata = NULL;
-	devvp->v_vflag &= ~VV_COPYONWRITE;
-	VI_UNLOCK(devvp);
-	if (sn->sn_blklist != NULL) {
-		FREE(sn->sn_blklist, M_UFSMNT);
-		sn->sn_blklist = NULL;
-		sn->sn_listsize = 0;
+		VI_LOCK(devvp);
+		sn = devvp->v_rdev->si_snapdata;
 	}
-	lockdestroy(&sn->sn_lock);
-	free(sn, M_UFSMNT);
+	try_free_snapdata(devvp, td);
 	ASSERT_VOP_LOCKED(devvp, "ffs_snapshot_unmount");
 }
 
@@ -1984,6 +2011,7 @@
 	ufs2_daddr_t lbn, blkno, *snapblklist;
 	int lower, upper, mid, indiroff, error = 0;
 	int launched_async_io, prev_norunningbuf;
+	long saved_runningbufspace;
 
 	if ((VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
 		return (0);		/* Update on a snapshot file */
@@ -2026,7 +2054,9 @@
 	 * for a long time waiting on snaplk, back it out of
 	 * runningbufspace, possibly waking other threads waiting for space.
 	 */
-	runningbufwakeup(bp);
+	saved_runningbufspace = bp->b_runningbufspace;
+	if (saved_runningbufspace != 0)
+		runningbufwakeup(bp);
 	/*
 	 * Not in the precomputed list, so check the snapshots.
 	 */
@@ -2038,9 +2068,11 @@
 		if (sn == NULL ||
 		    TAILQ_FIRST(&sn->sn_head) == NULL) {
 			VI_UNLOCK(devvp);
-			if (bp->b_runningbufspace)
+			if (saved_runningbufspace != 0) {
+				bp->b_runningbufspace = saved_runningbufspace;
 				atomic_add_int(&runningbufspace,
 					       bp->b_runningbufspace);
+			}
 			return (0);		/* Snapshot gone */
 		}
 	}
@@ -2161,8 +2193,10 @@
 	/*
 	 * I/O on bp will now be started, so count it in runningbufspace.
 	 */
-	if (bp->b_runningbufspace)
+	if (saved_runningbufspace != 0) {
+		bp->b_runningbufspace = saved_runningbufspace;
 		atomic_add_int(&runningbufspace, bp->b_runningbufspace);
+	}
 	return (error);
 }
 
@@ -2184,13 +2218,10 @@
 	bip->bio_offset = dbtob(fsbtodb(ip->i_fs, blkstofrags(ip->i_fs, lbn)));
 	bip->bio_data = bp->b_data;
 	bip->bio_length = bp->b_bcount;
+	bip->bio_done = NULL;
 
 	g_io_request(bip, ip->i_devvp->v_bufobj.bo_private);
-
-	do 
-		msleep(bip, NULL, PRIBIO, "snaprdb", hz/10);
-	while (!(bip->bio_flags & BIO_DONE));
-	bp->b_error = bip->bio_error;
+	bp->b_error = biowait(bip, "snaprdb");
 	g_destroy_bio(bip);
 	return (bp->b_error);
 }
@@ -2259,3 +2290,32 @@
 	MNT_IUNLOCK(mp);
 	vn_finished_secondary_write(mp);
 }
+
+/* Try to free snapdata associated with devvp */
+static void
+try_free_snapdata(struct vnode *devvp,
+		  struct thread *td)
+{
+	struct snapdata *sn;
+	ufs2_daddr_t *snapblklist;
+
+	sn = devvp->v_rdev->si_snapdata;
+
+	if (sn == NULL || TAILQ_FIRST(&sn->sn_head) != NULL ||
+	    (devvp->v_vflag & VV_COPYONWRITE) == 0) {
+		VI_UNLOCK(devvp);
+		return;
+	}
+
+	devvp->v_rdev->si_snapdata = NULL;
+	devvp->v_vflag &= ~VV_COPYONWRITE;
+	snapblklist = sn->sn_blklist;
+	sn->sn_blklist = NULL;
+	sn->sn_listsize = 0;
+	lockmgr(&sn->sn_lock, LK_DRAIN|LK_INTERLOCK, VI_MTX(devvp), td);
+	lockmgr(&sn->sn_lock, LK_RELEASE, NULL, td);
+	lockdestroy(&sn->sn_lock);
+	free(sn, M_UFSMNT);
+	if (snapblklist != NULL)
+		FREE(snapblklist, M_UFSMNT);
+}
Index: ufs/ffs/ffs_vfsops.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vfsops.c,v
retrieving revision 1.290.2.9
diff -u -r1.290.2.9 ffs_vfsops.c
--- ufs/ffs/ffs_vfsops.c	22 Mar 2006 17:54:50 -0000	1.290.2.9
+++ ufs/ffs/ffs_vfsops.c	7 May 2006 16:27:56 -0000
@@ -1728,6 +1728,7 @@
 		if ((vp->v_vflag & VV_COPYONWRITE) &&
 		    vp->v_rdev->si_snapdata != NULL) {
 			if ((bp->b_flags & B_CLUSTER) != 0) {
+				runningbufwakeup(bp);
 				TAILQ_FOREACH(tbp, &bp->b_cluster.cluster_head,
 					      b_cluster.cluster_entry) {
 					error = ffs_copyonwrite(vp, tbp);
@@ -1739,6 +1740,9 @@
 						return;
 					}
 				}
+				bp->b_runningbufspace = bp->b_bufsize;
+				atomic_add_int(&runningbufspace,
+					       bp->b_runningbufspace);
 			} else {
 				error = ffs_copyonwrite(vp, bp);
 				if (error != 0 && error != EOPNOTSUPP) {
Index: ufs/ffs/ffs_vnops.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.157.2.1
diff -u -r1.157.2.1 ffs_vnops.c
--- ufs/ffs/ffs_vnops.c	29 Oct 2005 06:43:55 -0000	1.157.2.1
+++ ufs/ffs/ffs_vnops.c	7 May 2006 16:27:56 -0000
@@ -338,7 +338,54 @@
 		struct thread *a_td;
 	} */ *ap;
 {
+#ifndef NO_FFS_SNAPSHOT
+	struct vnode *vp;
+	int flags;
+	struct lock *lkp;
+	int result;
+	
+	switch (ap->a_flags & LK_TYPE_MASK) {
+	case LK_SHARED:
+	case LK_UPGRADE:
+	case LK_EXCLUSIVE:
+		vp = ap->a_vp;
+		flags = ap->a_flags;
+		for (;;) {
+			/*
+			 * vnode interlock must be held to ensure that
+			 * the possibly external lock isn't freed,
+			 * e.g. when mutating from snapshot file vnode
+			 * to regular file vnode.
+			 */
+			if ((flags & LK_INTERLOCK) == 0) {
+				VI_LOCK(vp);
+				flags |= LK_INTERLOCK;
+			}
+			lkp = vp->v_vnlock;
+			result = lockmgr(lkp, flags, VI_MTX(vp), ap->a_td);
+			if (lkp == vp->v_vnlock || result != 0)
+				break;
+			/*
+			 * Apparent success, except that the vnode
+			 * mutated between snapshot file vnode and
+			 * regular file vnode while this process
+			 * slept.  The lock currently held is not the
+			 * right lock.  Release it, and try to get the
+			 * new lock.
+			 */
+			(void) lockmgr(lkp, LK_RELEASE, VI_MTX(vp), ap->a_td);
+			if ((flags & LK_TYPE_MASK) == LK_UPGRADE)
+				flags = (flags & ~LK_TYPE_MASK) | LK_EXCLUSIVE;
+			flags &= ~LK_INTERLOCK;
+		}
+		break;
+	default:
+		result = VOP_LOCK_APV(&ufs_vnodeops, ap);
+	}
+	return (result);
+#else
 	return (VOP_LOCK_APV(&ufs_vnodeops, ap));
+#endif
 }
 
 /*
Index: ufs/ufs/ufs_quota.c
===================================================================
RCS file: /home/ncvs/src/sys/ufs/ufs/ufs_quota.c,v
retrieving revision 1.74.2.2.2.1
diff -u -r1.74.2.2.2.1 ufs_quota.c
--- ufs/ufs/ufs_quota.c	26 Apr 2006 01:23:59 -0000	1.74.2.2.2.1
+++ ufs/ufs/ufs_quota.c	7 May 2006 16:27:56 -0000
@@ -35,6 +35,8 @@
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
+#include "opt_ffs.h"
+
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/fcntl.h>
@@ -46,6 +48,7 @@
 #include <sys/namei.h>
 #include <sys/proc.h>
 #include <sys/socket.h>
+#include <sys/stat.h>
 #include <sys/sysctl.h>
 #include <sys/vnode.h>
 
@@ -97,6 +100,13 @@
 	struct vnode *vp = ITOV(ip);
 	int error;
 
+#ifndef NO_FFS_SNAPSHOT
+	/*
+	 * Disk quotas must be turned off for snapshot files.
+	 */
+	if ((ip->i_flags & SF_SNAPSHOT) != 0)
+		return (0);
+#endif
 	ump = VFSTOUFS(vp->v_mount);
 	/*
 	 * Set up the user quota based on file uid.
@@ -375,6 +385,13 @@
 	struct ufsmount *ump = VFSTOUFS(ITOV(ip)->v_mount);
 	int i;
 
+#ifndef NO_FFS_SNAPSHOT
+	/*
+	 * Disk quotas must be turned off for snapshot files.
+	 */
+	if ((ip->i_flags & SF_SNAPSHOT) != 0)
+		return;
+#endif
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (ump->um_quotas[i] == NULLVP ||
 		    (ump->um_qflags[i] & (QTF_OPENING|QTF_CLOSING)))


More information about the freebsd-stable mailing list