svn commit: r247387 - head/sys/ufs/ffs

Konstantin Belousov kib at FreeBSD.org
Wed Feb 27 07:31:24 UTC 2013


Author: kib
Date: Wed Feb 27 07:31:23 2013
New Revision: 247387
URL: http://svnweb.freebsd.org/changeset/base/247387

Log:
  An inode block must not be blockingly read while cg block is owned.
  The order is inode buffer lock -> snaplk -> cg buffer lock, reversing
  the order causes deadlocks.
  
  Inode block must not be written while cg block buffer is owned. The
  FFS copy on write needs to allocate a block to copy the content of the
  inode block, and the cylinder group selected for the allocation might
  be the same as the owned cg block.  The reserved block detection code
  in the ffs_copyonwrite() and ffs_bp_snapblk() is unable to detect the
  situation, because the locked cg buffer is not exposed to it.
  
  In order to maintain the dependency between initialized inode block
  and the cg_initediblk pointer, look up the inode buffer in
  non-blocking mode. If succeeded, brelse cg block, initialize the inode
  block and write it.  After the write is finished, reread cg block and
  update the cg_initediblk.
  
  If inode block is already locked by another thread, let the another
  thread initialize it.  If another thread raced with us after we
  started writing inode block, the situation is detected by an update of
  cg_initediblk.  Note that double-initialization of the inode block is
  harmless, the block cannot be used until cg_initediblk is incremented.
  
  Sponsored by:	The FreeBSD Foundation
  In collaboration with:	pho
  Reviewed by:	mckusick
  MFC after:	1 month
  X-MFC-note:	after r246877

Modified:
  head/sys/ufs/ffs/ffs_alloc.c

Modified: head/sys/ufs/ffs/ffs_alloc.c
==============================================================================
--- head/sys/ufs/ffs/ffs_alloc.c	Wed Feb 27 06:53:15 2013	(r247386)
+++ head/sys/ufs/ffs/ffs_alloc.c	Wed Feb 27 07:31:23 2013	(r247387)
@@ -1790,6 +1790,17 @@ fail:
 	return (0);
 }
 
+static inline struct buf *
+getinobuf(struct inode *ip, u_int cg, u_int32_t cginoblk, int gbflags)
+{
+	struct fs *fs;
+
+	fs = ip->i_fs;
+	return (getblk(ip->i_devvp, fsbtodb(fs, ino_to_fsba(fs,
+	    cg * fs->fs_ipg + cginoblk)), (int)fs->fs_bsize, 0, 0,
+	    gbflags));
+}
+
 /*
  * Determine whether an inode can be allocated.
  *
@@ -1814,9 +1825,11 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu
 	u_int8_t *inosused, *loc;
 	struct ufs2_dinode *dp2;
 	int error, start, len, i;
+	u_int32_t old_initediblk;
 
 	fs = ip->i_fs;
 	ump = ip->i_ump;
+check_nifree:
 	if (fs->fs_cs(fs, cg).cs_nifree == 0)
 		return (0);
 	UFS_UNLOCK(ump);
@@ -1828,13 +1841,13 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu
 		return (0);
 	}
 	cgp = (struct cg *)bp->b_data;
+restart:
 	if (!cg_chkmagic(cgp) || cgp->cg_cs.cs_nifree == 0) {
 		brelse(bp);
 		UFS_LOCK(ump);
 		return (0);
 	}
 	bp->b_xflags |= BX_BKGRDWRITE;
-	cgp->cg_old_time = cgp->cg_time = time_second;
 	inosused = cg_inosused(cgp);
 	if (ipref) {
 		ipref %= fs->fs_ipg;
@@ -1856,7 +1869,6 @@ ffs_nodealloccg(ip, cg, ipref, mode, unu
 		}
 	}
 	ipref = (loc - inosused) * NBBY + ffs(~*loc) - 1;
-	cgp->cg_irotor = ipref;
 gotit:
 	/*
 	 * Check to see if we need to initialize more inodes.
@@ -1864,9 +1876,37 @@ gotit:
 	if (fs->fs_magic == FS_UFS2_MAGIC &&
 	    ipref + INOPB(fs) > cgp->cg_initediblk &&
 	    cgp->cg_initediblk < cgp->cg_niblk) {
-		ibp = getblk(ip->i_devvp, fsbtodb(fs,
-		    ino_to_fsba(fs, cg * fs->fs_ipg + cgp->cg_initediblk)),
-		    (int)fs->fs_bsize, 0, 0, 0);
+		old_initediblk = cgp->cg_initediblk;
+
+		/*
+		 * Free the cylinder group lock before writing the
+		 * initialized inode block.  Entering the
+		 * babarrierwrite() with the cylinder group lock
+		 * causes lock order violation between the lock and
+		 * snaplk.
+		 *
+		 * Another thread can decide to initialize the same
+		 * inode block, but whichever thread first gets the
+		 * cylinder group lock after writing the newly
+		 * allocated inode block will update it and the other
+		 * will realize that it has lost and leave the
+		 * cylinder group unchanged.
+		 */
+		ibp = getinobuf(ip, cg, old_initediblk, GB_LOCK_NOWAIT);
+		brelse(bp);
+		if (ibp == NULL) {
+			/*
+			 * The inode block buffer is already owned by
+			 * another thread, which must initialize it.
+			 * Wait on the buffer to allow another thread
+			 * to finish the updates, with dropped cg
+			 * buffer lock, then retry.
+			 */
+			ibp = getinobuf(ip, cg, old_initediblk, 0);
+			brelse(ibp);
+			UFS_LOCK(ump);
+			goto check_nifree;
+		}
 		bzero(ibp->b_data, (int)fs->fs_bsize);
 		dp2 = (struct ufs2_dinode *)(ibp->b_data);
 		for (i = 0; i < INOPB(fs); i++) {
@@ -1883,8 +1923,29 @@ gotit:
 		 * loading of newly created filesystems.
 		 */
 		babarrierwrite(ibp);
-		cgp->cg_initediblk += INOPB(fs);
+
+		/*
+		 * After the inode block is written, try to update the
+		 * cg initediblk pointer.  If another thread beat us
+		 * to it, then leave it unchanged as the other thread
+		 * has already set it correctly.
+		 */
+		error = bread(ip->i_devvp, fsbtodb(fs, cgtod(fs, cg)),
+		    (int)fs->fs_cgsize, NOCRED, &bp);
+		UFS_LOCK(ump);
+		ACTIVECLEAR(fs, cg);
+		UFS_UNLOCK(ump);
+		if (error != 0) {
+			brelse(bp);
+			return (error);
+		}
+		cgp = (struct cg *)bp->b_data;
+		if (cgp->cg_initediblk == old_initediblk)
+			cgp->cg_initediblk += INOPB(fs);
+		goto restart;
 	}
+	cgp->cg_old_time = cgp->cg_time = time_second;
+	cgp->cg_irotor = ipref;
 	UFS_LOCK(ump);
 	ACTIVECLEAR(fs, cg);
 	setbit(inosused, ipref);


More information about the svn-src-all mailing list