ffs snapshot lockup

Kostik Belousov kostikbel at gmail.com
Tue Oct 3 08:43:25 UTC 2006


On Mon, Oct 02, 2006 at 03:23:49PM -0400, Vivek Khera wrote:
> 
> On Sep 22, 2006, at 4:36 PM, Kris Kennaway wrote:
> 
> >Start by enabling INVARIANTS, INVARIANT_SUPPORT, DEBUG_LOCKS and
> >DEBUG_VFS_LOCKS, then run 'show lockedvnods' and 'alltrace' in DDB
> >(spammy, need that serial console), or at least trace the running
> >processes (show allpcpu) and those listed in lockedvnods.  Then call
> >doadump and save the core+kernel.debug when you reboot.
> 
> Well, what an exciting weekend we had!  Three lockups/panics: two  
> during running a dump (mksnap_ffs seems to be the culprit) and one  
> running background fsck after rebooting from the prior crash.
> 
> Details are posted at http://vivek.khera.org/scratch/crashlogs/
> 
> I have the crashdumps available to a kernel hacker upon request (i'd  
> rather not make them generally available to the public...)
> 
It seems that you have snapshotted fs exported by nfsd ? At least, 18a is
definitely the case. I have the patch (for current) that shall fix the issue.
In fact, you need two patches:

1. buffer ownership patch by Tor Egge (already in current, I think it shall
be MFCed before 6.2)

tegge       2006-10-02 02:06:27 UTC

  FreeBSD src repository

  Modified files:
    sys/kern             kern_lock.c vfs_bio.c 
    sys/sys              buf.h lockmgr.h 
  Log:
  If the buffer lock has waiters after the buffer has changed identity then
  getnewbuf() needs to drop the buffer in order to wake waiters that might
  sleep on the buffer in the context of the old identity.
  
  Revision  Changes    Path
  1.100     +15 -0     src/sys/kern/kern_lock.c
  1.510     +11 -0     src/sys/kern/vfs_bio.c
  1.194     +11 -0     src/sys/sys/buf.h
  1.51      +1 -0      src/sys/sys/lockmgr.h
_______________________________________________
cvs-all at freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/cvs-all
To unsubscribe, send any mail to "cvs-all-unsubscribe at freebsd.org"


Index: src/sys/kern/kern_lock.c
diff -u src/sys/kern/kern_lock.c:1.99 src/sys/kern/kern_lock.c:1.100
--- src/sys/kern/kern_lock.c:1.99	Tue Aug 15 18:29:01 2006
+++ src/sys/kern/kern_lock.c	Mon Oct  2 02:06:26 2006
@@ -566,6 +566,21 @@
 }
 
 /*
+ * Determine the number of waiters on a lock.
+ */
+int
+lockwaiters(lkp)
+	struct lock *lkp;
+{
+	int count;
+
+	mtx_lock(lkp->lk_interlock);
+	count = lkp->lk_waitcount;
+	mtx_unlock(lkp->lk_interlock);
+	return (count);
+}
+
+/*
  * Print out information about state of a lock. Used by VOP_PRINT
  * routines to display status about contained locks.
  */
Index: src/sys/kern/vfs_bio.c
diff -u src/sys/kern/vfs_bio.c:1.509 src/sys/kern/vfs_bio.c:1.510
--- src/sys/kern/vfs_bio.c:1.509	Wed Aug  9 17:43:26 2006
+++ src/sys/kern/vfs_bio.c	Mon Oct  2 02:06:26 2006
@@ -1890,6 +1890,17 @@
 		}
 
 		/*
+		 * Notify any waiters for the buffer lock about
+		 * identity change by freeing the buffer.
+		 */
+		if (qindex == QUEUE_CLEAN && BUF_LOCKWAITERS(bp) > 0) {
+			bp->b_flags |= B_INVAL;
+			bfreekva(bp);
+			brelse(bp);
+			goto restart;
+		}
+
+		/*
 		 * If we are overcomitted then recover the buffer and its
 		 * KVM space.  This occurs in rare situations when multiple
 		 * processes are blocked in getnewbuf() or allocbuf().
Index: src/sys/sys/buf.h
diff -u src/sys/sys/buf.h:1.193 src/sys/sys/buf.h:1.194
--- src/sys/sys/buf.h:1.193	Fri Mar 31 02:56:30 2006
+++ src/sys/sys/buf.h	Mon Oct  2 02:06:27 2006
@@ -371,6 +371,17 @@
 	return ret;
 }
 
+
+/*
+ * Find out the number of waiters on a lock.
+ */
+static __inline int BUF_LOCKWAITERS(struct buf *);
+static __inline int
+BUF_LOCKWAITERS(struct buf *bp)
+{
+	return (lockwaiters(&bp->b_lock));
+}
+
 #endif /* _KERNEL */
 
 struct buf_queue_head {
Index: src/sys/sys/lockmgr.h
diff -u src/sys/sys/lockmgr.h:1.50 src/sys/sys/lockmgr.h:1.51
--- src/sys/sys/lockmgr.h:1.50	Tue Aug 15 18:29:01 2006
+++ src/sys/sys/lockmgr.h	Mon Oct  2 02:06:27 2006
@@ -203,6 +203,7 @@
 void	lockmgr_printinfo(struct lock *);
 int	lockstatus(struct lock *, struct thread *);
 int	lockcount(struct lock *);
+int	lockwaiters(struct lock *);
 #ifdef DDB
 int	lockmgr_chain(struct thread *td, struct thread **ownerp);
 #endif

2. this one (it shall be applied in the sys/ufs; please, test and report
results)

Index: ffs/ffs_inode.c
===================================================================
RCS file: /usr/local/arch/ncvs/src/sys/ufs/ffs/ffs_inode.c,v
retrieving revision 1.106
diff -u -r1.106 ffs_inode.c
--- ffs/ffs_inode.c	5 Apr 2005 08:49:41 -0000	1.106
+++ ffs/ffs_inode.c	27 Sep 2006 08:29:18 -0000
@@ -66,9 +66,11 @@
  * IN_ACCESS, IN_UPDATE, and IN_CHANGE flags respectively.  Write the inode
  * to disk if the IN_MODIFIED flag is set (it may be set initially, or by
  * the timestamp update).  The IN_LAZYMOD flag is set to force a write
- * later if not now.  If we write now, then clear both IN_MODIFIED and
- * IN_LAZYMOD to reflect the presumably successful write, and if waitfor is
- * set, then wait for the write to complete.
+ * later if not now.  The IN_LAZYACCESS is set instead of IN_MODIFIED if fs
+ * is currently suspending/suspended and vnode has been accessed.  If we
+ * write now, then clear IN_MODIFIED, IN_LAZYACCESS and IN_LAZYMOD to reflect
+ * the presumably successful write, and if waitfor is set, then wait for the
+ * write to complete.
  */
 int
 ffs_update(vp, waitfor)
@@ -80,12 +82,12 @@
 	struct inode *ip;
 	int error;
 
-	ASSERT_VOP_LOCKED(vp, "ffs_update");
+	ASSERT_VOP_ELOCKED(vp, "ffs_update");
 	ufs_itimes(vp);
 	ip = VTOI(vp);
 	if ((ip->i_flag & IN_MODIFIED) == 0 && waitfor == 0)
 		return (0);
-	ip->i_flag &= ~(IN_LAZYMOD | IN_MODIFIED);
+	ip->i_flag &= ~(IN_LAZYACCESS | IN_LAZYMOD | IN_MODIFIED);
 	fs = ip->i_fs;
 	if (fs->fs_ronly)
 		return (0);
Index: ffs/ffs_snapshot.c
===================================================================
RCS file: /usr/local/arch/ncvs/src/sys/ufs/ffs/ffs_snapshot.c,v
retrieving revision 1.130
diff -u -r1.130 ffs_snapshot.c
--- ffs/ffs_snapshot.c	26 Sep 2006 04:19:11 -0000	1.130
+++ ffs/ffs_snapshot.c	27 Sep 2006 08:29:18 -0000
@@ -2309,15 +2309,16 @@
 	return (bp->b_error);
 }
 
-
 /*
  * Process file deletes that were deferred by ufs_inactive() due to
- * the file system being suspended.
+ * the file system being suspended. Transfer IN_LAZYACCESS into
+ * IN_MODIFIED for vnodes that were accessed during suspension.
  */
 static void
 process_deferred_inactive(struct mount *mp)
 {
 	struct vnode *vp, *mvp;
+	struct inode *ip;
 	struct thread *td;
 	int error;
 
@@ -2327,9 +2328,14 @@
  loop:
 	MNT_VNODE_FOREACH(vp, mp, mvp) {
 		VI_LOCK(vp);
-		if ((vp->v_iflag & (VI_DOOMED | VI_OWEINACT)) != VI_OWEINACT ||
-		    vp->v_usecount > 0 ||
-		    vp->v_type == VNON) {
+		/*
+		 * IN_LAZYACCESS is checked here without holding any
+		 * vnode lock, but this flag is set only while holding
+		 * vnode interlock.
+		 */
+		if (vp->v_type == VNON || (vp->v_iflag & VI_DOOMED) != 0 ||
+		    ((VTOI(vp)->i_flag & IN_LAZYACCESS) == 0 &&
+		     ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0))) {
 			VI_UNLOCK(vp);
 			continue;
 		}
@@ -2344,8 +2350,13 @@
 			MNT_VNODE_FOREACH_ABORT_ILOCKED(mp, mvp);
 			goto loop;
 		}
+		ip = VTOI(vp);
+		if ((ip->i_flag & IN_LAZYACCESS) != 0) {
+			ip->i_flag &= ~IN_LAZYACCESS;
+			ip->i_flag |= IN_MODIFIED;
+		}
 		VI_LOCK(vp);
-		if ((vp->v_iflag & VI_OWEINACT) == 0) {
+		if ((vp->v_iflag & VI_OWEINACT) == 0 || vp->v_usecount > 0) {
 			VI_UNLOCK(vp);
 			VOP_UNLOCK(vp, 0, td);
 			vdrop(vp);
Index: ffs/ffs_vnops.c
===================================================================
RCS file: /usr/local/arch/ncvs/src/sys/ufs/ffs/ffs_vnops.c,v
retrieving revision 1.160
diff -u -r1.160 ffs_vnops.c
--- ffs/ffs_vnops.c	5 May 2006 19:58:36 -0000	1.160
+++ ffs/ffs_vnops.c	27 Sep 2006 08:29:18 -0000
@@ -593,8 +593,11 @@
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
-	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+		VI_LOCK(vp);
 		ip->i_flag |= IN_ACCESS;
+		VI_UNLOCK(vp);
+	}
 	return (error);
 }
 
@@ -989,8 +992,11 @@
 	}
 
 	if ((error == 0 || uio->uio_resid != orig_resid) &&
-	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
+	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0) {
+		VI_LOCK(vp);
 		ip->i_flag |= IN_ACCESS;
+		VI_UNLOCK(vp);
+	}
 	return (error);
 }
 
Index: ufs/inode.h
===================================================================
RCS file: /usr/local/arch/ncvs/src/sys/ufs/ufs/inode.h,v
retrieving revision 1.50
diff -u -r1.50 inode.h
--- ufs/inode.h	26 Sep 2006 04:15:59 -0000	1.50
+++ ufs/inode.h	27 Sep 2006 08:29:18 -0000
@@ -55,6 +55,13 @@
  * is the permanent meta-data associated with the file which is read in
  * from the permanent dinode from long term storage when the file becomes
  * active, and is put back when the file is no longer being used.
+ *
+ * An inode may only be changed while holding either the exclusive
+ * vnode lock or the shared vnode lock and the vnode interlock. We use
+ * the latter only for "read" and "get" operations that require
+ * changing i_flag, or a timestamp. This locking protocol allows executing
+ * those operations without having to upgrade the vnode lock from shared to
+ * exclusive.
  */
 struct inode {
 	TAILQ_ENTRY(inode) i_nextsnap; /* snapshot file list. */
@@ -119,6 +126,8 @@
 #define	IN_RENAME	0x0010		/* Inode is being renamed. */
 #define	IN_LAZYMOD	0x0040		/* Modified, but don't write yet. */
 #define	IN_SPACECOUNTED	0x0080		/* Blocks to be freed in free count. */
+#define	IN_LAZYACCESS	0x0100		/* Process IN_ACCESS after the
+					   suspension finished */
 
 #define i_devvp i_ump->um_devvp
 #define i_umbufobj i_ump->um_bo
Index: ufs/ufs_vnops.c
===================================================================
RCS file: /usr/local/arch/ncvs/src/sys/ufs/ufs/ufs_vnops.c,v
retrieving revision 1.278
diff -u -r1.278 ufs_vnops.c
--- ufs/ufs_vnops.c	20 Aug 2006 10:52:44 -0000	1.278
+++ ufs/ufs_vnops.c	27 Sep 2006 08:29:18 -0000
@@ -128,31 +128,51 @@
 {
 	struct inode *ip;
 	struct timespec ts;
+	int mnt_locked;
 
 	ip = VTOI(vp);
+	mnt_locked = 0;
+	if ((vp->v_mount->mnt_flag & MNT_RDONLY) != 0) {
+		VI_LOCK(vp);
+		goto out;
+	}
+
+	MNT_ILOCK(vp->v_mount);		/* For reading of mnt_kern_flags */
+	mnt_locked = 1;
+	VI_LOCK(vp);
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
-		return;
+		goto out_unl;
+
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
-	else
+	else if (((vp->v_mount->mnt_kern_flag &
+			(MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
+		 (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
-	if ((vp->v_mount->mnt_flag & MNT_RDONLY) == 0) {
-		vfs_timestamp(&ts);
-		if (ip->i_flag & IN_ACCESS) {
-			DIP_SET(ip, i_atime, ts.tv_sec);
-			DIP_SET(ip, i_atimensec, ts.tv_nsec);
-		}
-		if (ip->i_flag & IN_UPDATE) {
-			DIP_SET(ip, i_mtime, ts.tv_sec);
-			DIP_SET(ip, i_mtimensec, ts.tv_nsec);
-			ip->i_modrev++;
-		}
-		if (ip->i_flag & IN_CHANGE) {
-			DIP_SET(ip, i_ctime, ts.tv_sec);
-			DIP_SET(ip, i_ctimensec, ts.tv_nsec);
-		}
+	else if (ip->i_flag & IN_ACCESS)
+		ip->i_flag |= IN_LAZYACCESS;
+	
+	vfs_timestamp(&ts);
+	if (ip->i_flag & IN_ACCESS) {
+		DIP_SET(ip, i_atime, ts.tv_sec);
+		DIP_SET(ip, i_atimensec, ts.tv_nsec);
+	}
+	if (ip->i_flag & IN_UPDATE) {
+		DIP_SET(ip, i_mtime, ts.tv_sec);
+		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
+		ip->i_modrev++;
+	}
+	if (ip->i_flag & IN_CHANGE) {
+		DIP_SET(ip, i_ctime, ts.tv_sec);
+		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 	}
+
+ out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
+ out_unl:
+	VI_UNLOCK(vp);
+	if (mnt_locked)
+		MNT_IUNLOCK(vp->v_mount);
 }
 
 /*
@@ -266,11 +286,15 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	int usecount;
 
 	VI_LOCK(vp);
-	if (vp->v_usecount > 1)
-		ufs_itimes(vp);
+	usecount = vp->v_usecount;
 	VI_UNLOCK(vp);
+
+	if (usecount > 1)
+		ufs_itimes(vp);
+
 	return (0);
 }
 
@@ -378,8 +402,10 @@
 	if (ip->i_ump->um_fstype == UFS1) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
+		VI_LOCK(vp);
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
+		VI_UNLOCK(vp);
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
@@ -390,8 +416,10 @@
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
+		VI_LOCK(vp);
 		vap->va_atime.tv_sec = ip->i_din2->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
+		VI_UNLOCK(vp);
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
@@ -1992,11 +2020,15 @@
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
+	int usecount;
 
 	VI_LOCK(vp);
-	if (vp->v_usecount > 1)
-		ufs_itimes(vp);
+	usecount = vp->v_usecount;
 	VI_UNLOCK(vp);
+
+	if (usecount > 1)
+		ufs_itimes(vp);
+
 	return (fifo_specops.vop_close(ap));
 }
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 187 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-stable/attachments/20061003/44256a6f/attachment.pgp


More information about the freebsd-stable mailing list