svn commit: r233630 - in stable/9/sys: i386/conf kern sys ufs/ffs ufs/ufs

Wed Mar 28 21:34:56 UTC 2012

Author: mckusick
Date: Wed Mar 28 21:34:55 2012
New Revision: 233630
URL: http://svn.freebsd.org/changeset/base/233630

Log:
  MFC of 232351, 233438, and 233629
  
  MFC reviewed by: kib
  
  MFC 232351:
  
  This change avoids a kernel deadlock on "snaplk" when using
  snapshots on UFS filesystems running with journaled soft updates.
  This is the first of several bugs that need to be fixed before
  removing the restriction added in -r230250 to prevent the use
  of snapshots on filesystems running with journaled soft updates.
  
  The deadlock occurs when holding the snapshot lock (snaplk)
  and then trying to flush an inode via ffs_update(). We become
  blocked by another process trying to flush a different inode
  contained in the same inode block that we need. It holds the
  inode block for which we are waiting locked. When it tries to
  write the inode block, it gets blocked waiting for the our
  snaplk when it calls ffs_copyonwrite() to see if the inode
  block needs to be copied in our snapshot.
  
  The most obvious place that this deadlock arises is in the
  ffs_copyonwrite() routine when it updates critical metadata
  in a snapshot and tries to write it out before proceeding.
  The fix here is to write the data and indirect block pointer
  for the snapshot, but to skip the call to ffs_update() to
  write the snapshot inode. To ensure that we will never have
  to update a pointer in the inode itself, the ffs_snapshot()
  routine that creates the snapshot has to ensure that all the
  direct blocks are allocated as part of the creation of the
  snapshot.
  
  A less obvious place that this deadlock occurs is when we hold
  the snaplk because we are deleting a snapshot. In the course of
  doing the deletion, we need to allocate various soft update
  dependency structures and allocate some journal space. If we
  hit a resource limit while doing this we decrease the resources
  in use by flushing out an existing dirty file to get it to give
  up the soft dependency resources that it holds. The flush can
  cause an ffs_update() to be done on the inode for the file that
  we have selected to flush resulting in the same deadlock as
  described above when the inode that we have chosen to flush
  resides in the same inode block as the snapshot inode that we hold.
  The fix is to defer cleaning up any time that the inode on which
  we are operating is a snapshot.
  
  Help and review by:    Jeff Roberson
  Tested by:             Peter Holm
  
  MFC 233438:
  
  Add a third flags argument to ffs_syncvnode to avoid a possible conflict
  with MNT_WAIT flags that passed in its second argument.
  
  Discussed with: kib
  
  MFC 233629:
  
  A refinement of change 232351 to avoid a race with a forcible unmount.
  While we have a snapshot vnode unlocked to avoid a deadlock with another
  inode in the same inode block being updated, the filesystem containing
  it may be forcibly unmounted. When that happens the snapshot vnode is
  revoked. We need to check for that condition and fail appropriately.
  
  Spotted by:  kib
  Reviewed by: kib

Modified:
  stable/9/sys/kern/vfs_bio.c
  stable/9/sys/sys/buf.h
  stable/9/sys/ufs/ffs/ffs_balloc.c
  stable/9/sys/ufs/ffs/ffs_extern.h
  stable/9/sys/ufs/ffs/ffs_inode.c
  stable/9/sys/ufs/ffs/ffs_rawread.c
  stable/9/sys/ufs/ffs/ffs_snapshot.c
  stable/9/sys/ufs/ffs/ffs_softdep.c
  stable/9/sys/ufs/ffs/ffs_vfsops.c
  stable/9/sys/ufs/ffs/ffs_vnops.c
  stable/9/sys/ufs/ufs/inode.h
Directory Properties:
  stable/9/sys/   (props changed)
  stable/9/sys/amd64/include/xen/   (props changed)
  stable/9/sys/boot/   (props changed)
  stable/9/sys/boot/i386/efi/   (props changed)
  stable/9/sys/boot/ia64/efi/   (props changed)
  stable/9/sys/boot/ia64/ski/   (props changed)
  stable/9/sys/boot/powerpc/boot1.chrp/   (props changed)
  stable/9/sys/boot/powerpc/ofw/   (props changed)
  stable/9/sys/cddl/contrib/opensolaris/   (props changed)
  stable/9/sys/conf/   (props changed)
  stable/9/sys/contrib/dev/acpica/   (props changed)
  stable/9/sys/contrib/octeon-sdk/   (props changed)
  stable/9/sys/contrib/pf/   (props changed)
  stable/9/sys/contrib/x86emu/   (props changed)
  stable/9/sys/fs/   (props changed)
  stable/9/sys/fs/ntfs/   (props changed)
  stable/9/sys/i386/conf/XENHVM   (props changed)

Modified: stable/9/sys/kern/vfs_bio.c
==============================================================================

--- stable/9/sys/kern/vfs_bio.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/kern/vfs_bio.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -782,19 +782,15 @@ bremfreel(struct buf *bp)
 	}
 }
 
-
 /*
- * Get a buffer with the specified data.  Look in the cache first.  We
- * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
- * is set, the buffer is valid and we do not have to do anything ( see
- * getblk() ).  This is really just a special case of breadn().
+ * Get a buffer with the specified data.
  */
 int
 bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
     struct buf **bpp)
 {
 
-	return (breadn(vp, blkno, size, 0, 0, 0, cred, bpp));
+	return (breadn_flags(vp, blkno, size, 0, 0, 0, cred, 0, bpp));
 }
 
 /*
@@ -842,11 +838,34 @@ breadn(struct vnode * vp, daddr_t blkno,
     daddr_t * rablkno, int *rabsize,
     int cnt, struct ucred * cred, struct buf **bpp)
 {
+
+	return (breadn_flags(vp, blkno, size, rablkno, rabsize, cnt,
+		    cred, 0, bpp));
+}
+
+/*
+ * Entry point for bread() and breadn().
+ *
+ * Get a buffer with the specified data.  Look in the cache first.  We
+ * must clear BIO_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
+ * is set, the buffer is valid and we do not have to do anything, see
+ * getblk(). Also starts asynchronous I/O on read-ahead blocks.
+ */
+int
+breadn_flags(struct vnode * vp, daddr_t blkno, int size,
+    daddr_t * rablkno, int *rabsize, int cnt,
+    struct ucred * cred, int flags, struct buf **bpp)
+{
 	struct buf *bp;
 	int rv = 0, readwait = 0;
 
 	CTR3(KTR_BUF, "breadn(%p, %jd, %d)", vp, blkno, size);
-	*bpp = bp = getblk(vp, blkno, size, 0, 0, 0);
+	/*
+	 * Can only return NULL if GB_LOCK_NOWAIT flag is specified.
+	 */
+	*bpp = bp = getblk(vp, blkno, size, 0, 0, flags);
+	if (bp == NULL)
+		return (EBUSY);
 
 	/* if not found in cache, do some I/O */
 	if ((bp->b_flags & B_CACHE) == 0) {

Modified: stable/9/sys/sys/buf.h
==============================================================================
--- stable/9/sys/sys/buf.h	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/sys/buf.h	Wed Mar 28 21:34:55 2012	(r233630)
@@ -483,6 +483,8 @@ int	bread(struct vnode *, daddr_t, int, 
 void	breada(struct vnode *, daddr_t *, int *, int, struct ucred *);
 int	breadn(struct vnode *, daddr_t, int, daddr_t *, int *, int,
 	    struct ucred *, struct buf **);
+int	breadn_flags(struct vnode *, daddr_t, int, daddr_t *, int *, int,
+	    struct ucred *, int, struct buf **);
 void	bdwrite(struct buf *);
 void	bawrite(struct buf *);
 void	bdirty(struct buf *);

Modified: stable/9/sys/ufs/ffs/ffs_balloc.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_balloc.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_balloc.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -450,7 +450,7 @@ fail:
 	 *
 	 * XXX Still have to journal the free below
 	 */
-	(void) ffs_syncvnode(vp, MNT_WAIT);
+	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
 	     blkp < allocblk; blkp++, lbns_remfree++) {
 		/*
@@ -497,7 +497,7 @@ fail:
 		dp->di_blocks -= btodb(deallocated);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
-	(void) ffs_syncvnode(vp, MNT_WAIT);
+	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	/*
 	 * After the buffers are invalidated and on-disk pointers are
 	 * cleared, free the blocks.
@@ -994,7 +994,7 @@ fail:
 	 *
 	 * XXX Still have to journal the free below
 	 */
-	(void) ffs_syncvnode(vp, MNT_WAIT);
+	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	for (deallocated = 0, blkp = allociblk, lbns_remfree = lbns;
 	     blkp < allocblk; blkp++, lbns_remfree++) {
 		/*
@@ -1041,7 +1041,7 @@ fail:
 		dp->di_blocks -= btodb(deallocated);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	}
-	(void) ffs_syncvnode(vp, MNT_WAIT);
+	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	/*
 	 * After the buffers are invalidated and on-disk pointers are
 	 * cleared, free the blocks.

Modified: stable/9/sys/ufs/ffs/ffs_extern.h
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_extern.h	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_extern.h	Wed Mar 28 21:34:55 2012	(r233630)
@@ -92,7 +92,7 @@ void	ffs_snapshot_mount(struct mount *mp
 void	ffs_snapshot_unmount(struct mount *mp);
 void	process_deferred_inactive(struct mount *mp);
 void	ffs_sync_snap(struct mount *, int);
-int	ffs_syncvnode(struct vnode *vp, int waitfor);
+int	ffs_syncvnode(struct vnode *vp, int waitfor, int flags);
 int	ffs_truncate(struct vnode *, off_t, int, struct ucred *, struct thread *);
 int	ffs_update(struct vnode *, int);
 int	ffs_valloc(struct vnode *, int, struct ucred *, struct vnode **);
@@ -167,6 +167,12 @@ void	softdep_freework(struct workhead *)
 #define FLUSH_INODES_WAIT	2
 #define FLUSH_BLOCKS		3
 #define FLUSH_BLOCKS_WAIT	4
+/*
+ * Flag to ffs_syncvnode() to request flushing of data only,
+ * but skip the ffs_update() on the inode itself. Used to avoid
+ * deadlock when flushing snapshot inodes while holding snaplk.
+ */
+#define	NO_INO_UPDT		0x00000001
 
 int	ffs_rdonly(struct inode *);
 

Modified: stable/9/sys/ufs/ffs/ffs_inode.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_inode.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_inode.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -81,7 +81,7 @@ ffs_update(vp, waitfor)
 	struct fs *fs;
 	struct buf *bp;
 	struct inode *ip;
-	int error;
+	int flags, error;
 
 	ASSERT_VOP_ELOCKED(vp, "ffs_update");
 	ufs_itimes(vp);
@@ -92,11 +92,51 @@ ffs_update(vp, waitfor)
 	fs = ip->i_fs;
 	if (fs->fs_ronly && ip->i_ump->um_fsckpid == 0)
 		return (0);
-	error = bread(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
-		(int)fs->fs_bsize, NOCRED, &bp);
-	if (error) {
-		brelse(bp);
-		return (error);
+	/*
+	 * If we are updating a snapshot and another process is currently
+	 * writing the buffer containing the inode for this snapshot then
+	 * a deadlock can occur when it tries to check the snapshot to see
+	 * if that block needs to be copied. Thus when updating a snapshot
+	 * we check to see if the buffer is already locked, and if it is
+	 * we drop the snapshot lock until the buffer has been written
+	 * and is available to us. We have to grab a reference to the
+	 * snapshot vnode to prevent it from being removed while we are
+	 * waiting for the buffer.
+	 */
+	flags = 0;
+	if (IS_SNAPSHOT(ip))
+		flags = GB_LOCK_NOWAIT;
+loop:
+	error = breadn_flags(ip->i_devvp,
+	     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
+	     (int) fs->fs_bsize, 0, 0, 0, NOCRED, flags, &bp);
+	if (error != 0) {
+		if (error != EBUSY) {
+			brelse(bp);
+			return (error);
+		}
+		KASSERT((IS_SNAPSHOT(ip)), ("EBUSY from non-snapshot"));
+		/*
+		 * Wait for our inode block to become available.
+		 *
+		 * Hold a reference to the vnode to protect against
+		 * ffs_snapgone(). Since we hold a reference, it can only
+		 * get reclaimed (VI_DOOMED flag) in a forcible downgrade
+		 * or unmount. For an unmount, the entire filesystem will be
+		 * gone, so we cannot attempt to touch anything associated
+		 * with it while the vnode is unlocked; all we can do is 
+		 * pause briefly and try again. If when we relock the vnode
+		 * we discover that it has been reclaimed, updating it is no
+		 * longer necessary and we can just return an error.
+		 */
+		vref(vp);
+		VOP_UNLOCK(vp, 0);
+		pause("ffsupd", 1);
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		vrele(vp);
+		if ((vp->v_iflag & VI_DOOMED) != 0)
+			return (ENOENT);
+		goto loop;
 	}
 	if (DOINGSOFTDEP(vp))
 		softdep_update_inodeblock(ip, bp, waitfor);
@@ -108,16 +148,16 @@ ffs_update(vp, waitfor)
 	else
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
-	if (waitfor && !DOINGASYNC(vp)) {
-		return (bwrite(bp));
-	} else if (vm_page_count_severe() || buf_dirty_count_severe()) {
-		return (bwrite(bp));
+	if ((waitfor && !DOINGASYNC(vp)) ||
+	    (vm_page_count_severe() || buf_dirty_count_severe())) {
+		error = bwrite(bp);
 	} else {
 		if (bp->b_bufsize == fs->fs_bsize)
 			bp->b_flags |= B_CLUSTEROK;
 		bdwrite(bp);
-		return (0);
+		error = 0;
 	}
+	return (error);
 }
 
 #define	SINGLE	0	/* index of single indirect block */
@@ -201,7 +241,7 @@ ffs_truncate(vp, length, flags, cred, td
 				goto extclean;
 			needextclean = 1;
 		} else {
-			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
+			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 				return (error);
 #ifdef QUOTA
 			(void) chkdq(ip, -extblocks, NOCRED, 0);
@@ -253,7 +293,7 @@ ffs_truncate(vp, length, flags, cred, td
 	}
 	if (fs->fs_ronly)
 		panic("ffs_truncate: read-only filesystem");
-	if ((ip->i_flags & SF_SNAPSHOT) != 0)
+	if (IS_SNAPSHOT(ip))
 		ffs_snapremove(vp);
 	vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
 	osize = ip->i_size;
@@ -294,7 +334,7 @@ ffs_truncate(vp, length, flags, cred, td
 			 * rarely, we solve the problem by syncing the file
 			 * so that it will have no data structures left.
 			 */
-			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
+			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 				return (error);
 		} else {
 			flags = IO_NORMAL | (needextclean ? IO_EXT: 0);
@@ -339,7 +379,7 @@ ffs_truncate(vp, length, flags, cred, td
 		 */
 		if (DOINGSOFTDEP(vp) && lbn < NDADDR &&
 		    fragroundup(fs, blkoff(fs, length)) < fs->fs_bsize &&
-		    (error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
+		    (error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 			return (error);
 		ip->i_size = length;
 		DIP_SET(ip, i_size, length);

Modified: stable/9/sys/ufs/ffs/ffs_rawread.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_rawread.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_rawread.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -163,7 +163,7 @@ ffs_rawread_sync(struct vnode *vp)
 		/* Flush dirty buffers */
 		if (bo->bo_dirty.bv_cnt > 0) {
 			BO_UNLOCK(bo);
-			if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0) {
+			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0) {
 				if (upgraded != 0)
 					VOP_LOCK(vp, LK_DOWNGRADE);
 				vn_finished_write(mp);

Modified: stable/9/sys/ufs/ffs/ffs_snapshot.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_snapshot.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_snapshot.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -203,6 +203,7 @@ ffs_snapshot(mp, snapfile)
 	ufs2_daddr_t numblks, blkno, *blkp, *snapblklist;
 	int error, cg, snaploc;
 	int i, size, len, loc;
+	ufs2_daddr_t blockno;
 	uint64_t flag;
 	struct timespec starttime = {0, 0}, endtime;
 	char saved_nice = 0;
@@ -361,7 +362,7 @@ restart:
 			goto out;
 		bawrite(nbp);
 		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT);
+			ffs_syncvnode(vp, MNT_WAIT, 0);
 	}
 	/*
 	 * Copy all the cylinder group maps. Although the
@@ -384,7 +385,7 @@ restart:
 		error = cgaccount(cg, vp, nbp, 1);
 		bawrite(nbp);
 		if (cg % 10 == 0)
-			ffs_syncvnode(vp, MNT_WAIT);
+			ffs_syncvnode(vp, MNT_WAIT, 0);
 		if (error)
 			goto out;
 	}
@@ -399,7 +400,7 @@ restart:
 	 * Since we have marked it as a snapshot it is safe to
 	 * unlock it as no process will be allowed to write to it.
 	 */
-	if ((error = ffs_syncvnode(vp, MNT_WAIT)) != 0)
+	if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)) != 0)
 		goto out;
 	VOP_UNLOCK(vp, 0);
 	/*
@@ -529,7 +530,7 @@ loop:
 		    (xvp->v_usecount == 0 &&
 		     (xvp->v_iflag & (VI_OWEINACT | VI_DOINGINACT)) == 0) ||
 		    xvp->v_type == VNON ||
-		    (VTOI(xvp)->i_flags & SF_SNAPSHOT)) {
+		    IS_SNAPSHOT(VTOI(xvp))) {
 			VI_UNLOCK(xvp);
 			MNT_ILOCK(mp);
 			continue;
@@ -815,21 +816,26 @@ out1:
 	if (space != NULL)
 		free(space, M_UFSMNT);
 	/*
-	 * If another process is currently writing the buffer containing
-	 * the inode for this snapshot then a deadlock can occur. Drop
-	 * the snapshot lock until the buffer has been written.
+	 * Preallocate all the direct blocks in the snapshot inode so
+	 * that we never have to write the inode itself to commit an
+	 * update to the contents of the snapshot. Note that once
+	 * created, the size of the snapshot will never change, so
+	 * there will never be a need to write the inode except to
+	 * update the non-integrity-critical time fields and
+	 * allocated-block count.
 	 */
-	VREF(vp);	/* Protect against ffs_snapgone() */
-	VOP_UNLOCK(vp, 0);
-	(void) bread(ip->i_devvp,
-		     fsbtodb(fs, ino_to_fsba(fs, ip->i_number)),
-		     (int) fs->fs_bsize, NOCRED, &nbp);
-	brelse(nbp);
-	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
-	if (ip->i_effnlink == 0)
-		error = ENOENT;		/* Snapshot file unlinked */
-	else
-		vrele(vp);		/* Drop extra reference */
+	for (blockno = 0; blockno < NDADDR; blockno++) {
+		if (DIP(ip, i_db[blockno]) != 0)
+			continue;
+		error = UFS_BALLOC(vp, lblktosize(fs, blockno),
+		    fs->fs_bsize, KERNCRED, BA_CLRBUF, &bp);
+		if (error)
+			break;
+		error = readblock(vp, bp, blockno);
+		bawrite(bp);
+		if (error != 0)
+			break;
+	}
 done:
 	free(copy_fs->fs_csp, M_UFSMNT);
 	free(copy_fs, M_UFSMNT);
@@ -855,7 +861,7 @@ out:
 	MNT_IUNLOCK(mp);
 	if (error)
 		(void) ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
-	(void) ffs_syncvnode(vp, MNT_WAIT);
+	(void) ffs_syncvnode(vp, MNT_WAIT, 0);
 	if (error)
 		vput(vp);
 	else
@@ -1708,7 +1714,7 @@ ffs_snapremove(vp)
 	 * may find indirect pointers using the magic BLK_* values.
 	 */
 	if (DOINGSOFTDEP(vp))
-		ffs_syncvnode(vp, MNT_WAIT);
+		ffs_syncvnode(vp, MNT_WAIT, 0);
 #ifdef QUOTA
 	/*
 	 * Reenable disk quotas for ex-snapshot file.
@@ -1902,7 +1908,7 @@ retry:
 			bawrite(cbp);
 			if ((vtype == VDIR || dopersistence) &&
 			    ip->i_effnlink > 0)
-				(void) ffs_syncvnode(vp, MNT_WAIT);
+				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			continue;
 		}
 		/*
@@ -1913,7 +1919,7 @@ retry:
 			bawrite(cbp);
 			if ((vtype == VDIR || dopersistence) &&
 			    ip->i_effnlink > 0)
-				(void) ffs_syncvnode(vp, MNT_WAIT);
+				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			break;
 		}
 		savedcbp = cbp;
@@ -1931,7 +1937,7 @@ retry:
 		bawrite(savedcbp);
 		if ((vtype == VDIR || dopersistence) &&
 		    VTOI(vp)->i_effnlink > 0)
-			(void) ffs_syncvnode(vp, MNT_WAIT);
+			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 	}
 	/*
 	 * If we have been unable to allocate a block in which to do
@@ -1987,14 +1993,14 @@ ffs_snapshot_mount(mp)
 			continue;
 		}
 		ip = VTOI(vp);
-		if ((ip->i_flags & SF_SNAPSHOT) == 0 || ip->i_size ==
+		if (!IS_SNAPSHOT(ip) || ip->i_size ==
 		    lblktosize(fs, howmany(fs->fs_size, fs->fs_frag))) {
-			if ((ip->i_flags & SF_SNAPSHOT) == 0) {
+			if (!IS_SNAPSHOT(ip)) {
 				reason = "non-snapshot";
 			} else {
 				reason = "old format snapshot";
 				(void)ffs_truncate(vp, (off_t)0, 0, NOCRED, td);
-				(void)ffs_syncvnode(vp, MNT_WAIT);
+				(void)ffs_syncvnode(vp, MNT_WAIT, 0);
 			}
 			printf("ffs_snapshot_mount: %s inode %d\n",
 			    reason, fs->fs_snapinum[snaploc]);
@@ -2250,7 +2256,7 @@ ffs_copyonwrite(devvp, bp)
 	int launched_async_io, prev_norunningbuf;
 	long saved_runningbufspace;
 
-	if (devvp != bp->b_vp && (VTOI(bp->b_vp)->i_flags & SF_SNAPSHOT) != 0)
+	if (devvp != bp->b_vp && IS_SNAPSHOT(VTOI(bp->b_vp)))
 		return (0);		/* Update on a snapshot file */
 	if (td->td_pflags & TDP_COWINPROGRESS)
 		panic("ffs_copyonwrite: recursive call");
@@ -2395,7 +2401,7 @@ ffs_copyonwrite(devvp, bp)
 			bawrite(cbp);
 			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 			    dopersistence) && ip->i_effnlink > 0)
-				(void) ffs_syncvnode(vp, MNT_WAIT);
+				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			else
 				launched_async_io = 1;
 			continue;
@@ -2408,7 +2414,7 @@ ffs_copyonwrite(devvp, bp)
 			bawrite(cbp);
 			if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 			    dopersistence) && ip->i_effnlink > 0)
-				(void) ffs_syncvnode(vp, MNT_WAIT);
+				(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 			else
 				launched_async_io = 1;
 			break;
@@ -2428,7 +2434,7 @@ ffs_copyonwrite(devvp, bp)
 		bawrite(savedcbp);
 		if ((devvp == bp->b_vp || bp->b_vp->v_type == VDIR ||
 		    dopersistence) && VTOI(vp)->i_effnlink > 0)
-			(void) ffs_syncvnode(vp, MNT_WAIT);
+			(void) ffs_syncvnode(vp, MNT_WAIT, NO_INO_UPDT);
 		else
 			launched_async_io = 1;
 	}
@@ -2478,7 +2484,7 @@ ffs_sync_snap(mp, waitfor)
 	}
 	TAILQ_FOREACH(ip, &sn->sn_head, i_nextsnap) {
 		vp = ITOV(ip);
-		ffs_syncvnode(vp, waitfor);
+		ffs_syncvnode(vp, waitfor, NO_INO_UPDT);
 	}
 	lockmgr(&sn->sn_lock, LK_RELEASE, NULL);
 }

Modified: stable/9/sys/ufs/ffs/ffs_softdep.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_softdep.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_softdep.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -2825,7 +2825,12 @@ softdep_prealloc(vp, waitok)
 {
 	struct ufsmount *ump;
 
-	if (DOINGSUJ(vp) == 0)
+	/*
+	 * Nothing to do if we are not running journaled soft updates.
+	 * If we currently hold the snapshot lock, we must avoid handling
+	 * other resources that could cause deadlock.
+	 */
+	if (DOINGSUJ(vp) == 0 || IS_SNAPSHOT(VTOI(vp)))
 		return (0);
 	ump = VFSTOUFS(vp->v_mount);
 	ACQUIRE_LOCK(&lk);
@@ -2842,7 +2847,7 @@ softdep_prealloc(vp, waitok)
 	 * work attached to it.
 	 */
 	if ((curthread->td_pflags & TDP_COWINPROGRESS) == 0)
-		ffs_syncvnode(vp, waitok);
+		ffs_syncvnode(vp, waitok, 0);
 	ACQUIRE_LOCK(&lk);
 	process_removes(vp);
 	process_truncates(vp);
@@ -2871,13 +2876,18 @@ softdep_prelink(dvp, vp)
 
 	ump = VFSTOUFS(dvp->v_mount);
 	mtx_assert(&lk, MA_OWNED);
-	if (journal_space(ump, 0))
+	/*
+	 * Nothing to do if we have sufficient journal space.
+	 * If we currently hold the snapshot lock, we must avoid
+	 * handling other resources that could cause deadlock.
+	 */
+	if (journal_space(ump, 0) || (vp && IS_SNAPSHOT(VTOI(vp))))
 		return;
 	stat_journal_low++;
 	FREE_LOCK(&lk);
 	if (vp)
-		ffs_syncvnode(vp, MNT_NOWAIT);
-	ffs_syncvnode(dvp, MNT_WAIT);
+		ffs_syncvnode(vp, MNT_NOWAIT, 0);
+	ffs_syncvnode(dvp, MNT_WAIT, 0);
 	ACQUIRE_LOCK(&lk);
 	/* Process vp before dvp as it may create .. removes. */
 	if (vp) {
@@ -4302,11 +4312,15 @@ inodedep_lookup_ip(ip)
 	struct inode *ip;
 {
 	struct inodedep *inodedep;
+	int dflags;
 
 	KASSERT(ip->i_nlink >= ip->i_effnlink,
 	    ("inodedep_lookup_ip: bad delta"));
-	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number,
-	    DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	(void) inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags,
+	    &inodedep);
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
 
 	return (inodedep);
@@ -4694,7 +4708,7 @@ softdep_setup_inomapdep(bp, ip, newinum,
 	 * the cylinder group map from which it was allocated.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if ((inodedep_lookup(mp, newinum, DEPALLOC|NODELAY, &inodedep)))
+	if ((inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep)))
 		panic("softdep_setup_inomapdep: dependency %p for new"
 		    "inode already exists", inodedep);
 	bmsafemap = bmsafemap_lookup(mp, bp, ino_to_cg(fs, newinum));
@@ -5435,6 +5449,7 @@ softdep_setup_allocindir_page(ip, lbn, b
 	struct allocindir *aip;
 	struct pagedep *pagedep;
 	struct mount *mp;
+	int dflags;
 
 	if (lbn != nbp->b_lblkno)
 		panic("softdep_setup_allocindir_page: lbn %jd != lblkno %jd",
@@ -5442,7 +5457,10 @@ softdep_setup_allocindir_page(ip, lbn, b
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_page");
 	mp = UFSTOVFS(ip->i_ump);
 	aip = newallocindir(ip, ptrno, newblkno, oldblkno, lbn);
-	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
 	/*
 	 * If we are allocating a directory page, then we must
 	 * allocate an associated pagedep to track additions and
@@ -5472,11 +5490,15 @@ softdep_setup_allocindir_meta(nbp, ip, b
 	struct inodedep *inodedep;
 	struct allocindir *aip;
 	ufs_lbn_t lbn;
+	int dflags;
 
 	lbn = nbp->b_lblkno;
 	ASSERT_VOP_LOCKED(ITOV(ip), "softdep_setup_allocindir_meta");
 	aip = newallocindir(ip, ptrno, newblkno, 0, lbn);
-	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
 	WORKLIST_INSERT(&nbp->b_dep, &aip->ai_block.nb_list);
 	if (setup_allocindir_phase2(bp, ip, inodedep, aip, lbn))
 		panic("softdep_setup_allocindir_meta: Block already existed");
@@ -6083,11 +6105,7 @@ softdep_journal_freeblocks(ip, cred, len
 	struct mount *mp;
 	ufs2_daddr_t extblocks, datablocks;
 	ufs_lbn_t tmpval, lbn, lastlbn;
-	int frags;
-	int lastoff, iboff;
-	int allocblock;
-	int error, i;
-	int needj;
+	int frags, lastoff, iboff, allocblock, needj, dflags, error, i;
 
 	fs = ip->i_fs;
 	mp = UFSTOVFS(ip->i_ump);
@@ -6105,7 +6123,10 @@ softdep_journal_freeblocks(ip, cred, len
 	 * we don't need to journal the block frees.  The canceled journals
 	 * for the allocations will suffice.
 	 */
-	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
 	if ((inodedep->id_state & (UNLINKED | DEPCOMPLETE)) == UNLINKED &&
 	    length == 0)
 		needj = 0;
@@ -6230,7 +6251,7 @@ softdep_journal_freeblocks(ip, cred, len
 		*((struct ufs2_dinode *)bp->b_data +
 		    ino_to_fsbo(fs, ip->i_number)) = *ip->i_din2;
 	ACQUIRE_LOCK(&lk);
-	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
@@ -6308,7 +6329,7 @@ softdep_journal_freeblocks(ip, cred, len
 
 	}
 	ACQUIRE_LOCK(&lk);
-	inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
 	TAILQ_INSERT_TAIL(&inodedep->id_freeblklst, freeblks, fb_next);
 	freeblks->fb_state |= DEPCOMPLETE | ONDEPLIST;
 	/*
@@ -6396,7 +6417,7 @@ softdep_setup_freeblocks(ip, length, fla
 	struct fs *fs;
 	ufs2_daddr_t extblocks, datablocks;
 	struct mount *mp;
-	int i, delay, error;
+	int i, delay, error, dflags;
 	ufs_lbn_t tmpval;
 	ufs_lbn_t lbn;
 
@@ -6461,7 +6482,10 @@ softdep_setup_freeblocks(ip, length, fla
 	 * Find and eliminate any inode dependencies.
 	 */
 	ACQUIRE_LOCK(&lk);
-	(void) inodedep_lookup(mp, ip->i_number, DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	(void) inodedep_lookup(mp, ip->i_number, dflags, &inodedep);
 	if ((inodedep->id_state & IOSTARTED) != 0)
 		panic("softdep_setup_freeblocks: inode busy");
 	/*
@@ -8027,7 +8051,7 @@ softdep_setup_directory_add(bp, dp, diro
 	dap->da_pagedep = pagedep;
 	LIST_INSERT_HEAD(&pagedep->pd_diraddhd[DIRADDHASH(offset)], dap,
 	    da_pdlist);
-	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
+	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
 	/*
 	 * If we're journaling, link the diradd into the jaddref so it
 	 * may be completed after the journal entry is written.  Otherwise,
@@ -8629,8 +8653,7 @@ newdirrem(bp, dp, ip, isrmdir, prevdirre
 	 * the number of freefile and freeblks structures.
 	 */
 	ACQUIRE_LOCK(&lk);
-	if (!(ip->i_flags & SF_SNAPSHOT) &&
-	    dep_current[D_DIRREM] > max_softdeps / 2)
+	if (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2)
 		(void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS);
 	FREE_LOCK(&lk);
 	dirrem = malloc(sizeof(struct dirrem),
@@ -8864,11 +8887,11 @@ softdep_setup_directory_change(bp, dp, i
 	/*
 	 * Lookup the jaddref for this journal entry.  We must finish
 	 * initializing it and make the diradd write dependent on it.
-	 * If we're not journaling Put it on the id_bufwait list if the inode
-	 * is not yet written. If it is written, do the post-inode write
-	 * processing to put it on the id_pendinghd list.
+	 * If we're not journaling, put it on the id_bufwait list if the
+	 * inode is not yet written. If it is written, do the post-inode
+	 * write processing to put it on the id_pendinghd list.
 	 */
-	inodedep_lookup(mp, newinum, DEPALLOC, &inodedep);
+	inodedep_lookup(mp, newinum, DEPALLOC | NODELAY, &inodedep);
 	if (MOUNTEDSUJ(mp)) {
 		jaddref = (struct jaddref *)TAILQ_LAST(&inodedep->id_inoreflst,
 		    inoreflst);
@@ -8910,9 +8933,13 @@ softdep_change_linkcnt(ip)
 	struct inode *ip;	/* the inode with the increased link count */
 {
 	struct inodedep *inodedep;
+	int dflags;
 
 	ACQUIRE_LOCK(&lk);
-	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, DEPALLOC, &inodedep);
+	dflags = DEPALLOC;
+	if (IS_SNAPSHOT(ip))
+		dflags |= NODELAY;
+	inodedep_lookup(UFSTOVFS(ip->i_ump), ip->i_number, dflags, &inodedep);
 	if (ip->i_nlink < ip->i_effnlink)
 		panic("softdep_change_linkcnt: bad delta");
 	inodedep->id_nlinkdelta = ip->i_nlink - ip->i_effnlink;
@@ -11813,8 +11840,8 @@ restart:
 					pagedep_new_block = pagedep->pd_state & NEWBLOCK;
 					FREE_LOCK(&lk);
 					locked = 0;
-					if (pagedep_new_block &&
-					    (error = ffs_syncvnode(pvp, MNT_WAIT))) {
+					if (pagedep_new_block && (error =
+					    ffs_syncvnode(pvp, MNT_WAIT, 0))) {
 						vput(pvp);
 						return (error);
 					}
@@ -12542,22 +12569,25 @@ softdep_request_cleanup(fs, vp, cred, re
 	ufs2_daddr_t needed;
 	int error;
 
-	mp = vp->v_mount;
-	ump = VFSTOUFS(mp);
-	mtx_assert(UFS_MTX(ump), MA_OWNED);
-	if (resource == FLUSH_BLOCKS_WAIT)
-		stat_cleanup_blkrequests += 1;
-	else
-		stat_cleanup_inorequests += 1;
-
 	/*
 	 * If we are being called because of a process doing a
 	 * copy-on-write, then it is not safe to process any
 	 * worklist items as we will recurse into the copyonwrite
 	 * routine.  This will result in an incoherent snapshot.
+	 * If the vnode that we hold is a snapshot, we must avoid
+	 * handling other resources that could cause deadlock.
 	 */
-	if (curthread->td_pflags & TDP_COWINPROGRESS)
+	if ((curthread->td_pflags & TDP_COWINPROGRESS) || IS_SNAPSHOT(VTOI(vp)))
 		return (0);
+
+	if (resource == FLUSH_BLOCKS_WAIT)
+		stat_cleanup_blkrequests += 1;
+	else
+		stat_cleanup_inorequests += 1;
+
+	mp = vp->v_mount;
+	ump = VFSTOUFS(mp);
+	mtx_assert(UFS_MTX(ump), MA_OWNED);
 	UFS_UNLOCK(ump);
 	error = ffs_update(vp, 1);
 	if (error != 0) {
@@ -12652,7 +12682,7 @@ retry:
 				MNT_ILOCK(mp);
 				continue;
 			}
-			(void) ffs_syncvnode(lvp, MNT_NOWAIT);
+			(void) ffs_syncvnode(lvp, MNT_NOWAIT, 0);
 			vput(lvp);
 			MNT_ILOCK(mp);
 		}
@@ -12825,7 +12855,7 @@ clear_remove(td)
 				softdep_error("clear_remove: vget", error);
 				goto finish_write;
 			}
-			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
+			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
 				softdep_error("clear_remove: fsync", error);
 			bo = &vp->v_bufobj;
 			BO_LOCK(bo);
@@ -12908,10 +12938,10 @@ clear_inodedeps(td)
 		}
 		vfs_unbusy(mp);
 		if (ino == lastino) {
-			if ((error = ffs_syncvnode(vp, MNT_WAIT)))
+			if ((error = ffs_syncvnode(vp, MNT_WAIT, 0)))
 				softdep_error("clear_inodedeps: fsync1", error);
 		} else {
-			if ((error = ffs_syncvnode(vp, MNT_NOWAIT)))
+			if ((error = ffs_syncvnode(vp, MNT_NOWAIT, 0)))
 				softdep_error("clear_inodedeps: fsync2", error);
 			BO_LOCK(&vp->v_bufobj);
 			drain_output(vp);

Modified: stable/9/sys/ufs/ffs/ffs_vfsops.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_vfsops.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_vfsops.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -1505,7 +1505,7 @@ loop:
 			}
 			continue;
 		}
-		if ((error = ffs_syncvnode(vp, waitfor)) != 0)
+		if ((error = ffs_syncvnode(vp, waitfor, 0)) != 0)
 			allerror = error;
 		vput(vp);
 		MNT_ILOCK(mp);

Modified: stable/9/sys/ufs/ffs/ffs_vnops.c
==============================================================================
--- stable/9/sys/ufs/ffs/ffs_vnops.c	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ffs/ffs_vnops.c	Wed Mar 28 21:34:55 2012	(r233630)
@@ -184,7 +184,7 @@ ffs_fsync(struct vop_fsync_args *ap)
 	vp = ap->a_vp;
 	bo = &vp->v_bufobj;
 retry:
-	error = ffs_syncvnode(vp, ap->a_waitfor);
+	error = ffs_syncvnode(vp, ap->a_waitfor, 0);
 	if (error)
 		return (error);
 	if (ap->a_waitfor == MNT_WAIT && DOINGSOFTDEP(vp)) {
@@ -209,7 +209,7 @@ retry:
 }
 
 int
-ffs_syncvnode(struct vnode *vp, int waitfor)
+ffs_syncvnode(struct vnode *vp, int waitfor, int flags)
 {
 	struct inode *ip;
 	struct bufobj *bo;
@@ -300,7 +300,10 @@ next:
 	}
 	if (waitfor != MNT_WAIT) {
 		BO_UNLOCK(bo);
-		return (ffs_update(vp, 0));
+		if ((flags & NO_INO_UPDT) != 0)
+			return (0);
+		else
+			return (ffs_update(vp, 0));
 	}
 	/* Drain IO to see if we're done. */
 	bufobj_wwait(bo, 0, 0);
@@ -317,7 +320,7 @@ next:
 	 */
 	if (bo->bo_dirty.bv_cnt > 0) {
 		/* Write the inode after sync passes to flush deps. */
-		if (wait && DOINGSOFTDEP(vp)) {
+		if (wait && DOINGSOFTDEP(vp) && (flags & NO_INO_UPDT) == 0) {
 			BO_UNLOCK(bo);
 			ffs_update(vp, MNT_WAIT);
 			BO_LOCK(bo);
@@ -332,7 +335,9 @@ next:
 #endif
 	}
 	BO_UNLOCK(bo);
-	error = ffs_update(vp, MNT_WAIT);
+	error = 0;
+	if ((flags & NO_INO_UPDT) == 0)
+		error = ffs_update(vp, MNT_WAIT);
 	if (DOINGSUJ(vp))
 		softdep_journal_fsync(VTOI(vp));
 	return (error);

Modified: stable/9/sys/ufs/ufs/inode.h
==============================================================================
--- stable/9/sys/ufs/ufs/inode.h	Wed Mar 28 21:21:19 2012	(r233629)
+++ stable/9/sys/ufs/ufs/inode.h	Wed Mar 28 21:34:55 2012	(r233630)
@@ -158,6 +158,7 @@ struct inode {
 #define	SHORTLINK(ip) \
 	(((ip)->i_ump->um_fstype == UFS1) ? \
 	(caddr_t)(ip)->i_din1->di_db : (caddr_t)(ip)->i_din2->di_db)
+#define IS_SNAPSHOT(ip)		((ip)->i_flags & SF_SNAPSHOT)
 
 /*
  * Structure used to pass around logical block paths generated by