Nasty non-recursive lockmgr panic on softdep only enabled UFS
partition when filesystem full
Garrett Cooper
yanegomi at gmail.com
Thu May 5 19:49:50 UTC 2011
On Thu, May 5, 2011 at 10:36 AM, Kostik Belousov <kostikbel at gmail.com> wrote:
> On Thu, May 05, 2011 at 10:23:47AM -0700, Garrett Cooper wrote:
>> On May 4, 2011, at 2:07 AM, Kostik Belousov wrote:
>>
>> > On Tue, May 03, 2011 at 11:58:49PM -0700, Garrett Cooper wrote:
>> >> On Tue, May 3, 2011 at 11:42 PM, Garrett Cooper <yanegomi at gmail.com> wrote:
>> >>> On Tue, May 3, 2011 at 10:59 PM, Kirk McKusick <mckusick at mckusick.com> wrote:
>> >>>>> Date: Tue, 3 May 2011 22:40:26 -0700
>> >>>>> Subject: Nasty non-recursive lockmgr panic on softdep only enabled UFS
>> >>>>> partition when filesystem full
>> >>>>> From: Garrett Cooper <yanegomi at gmail.com>
>> >>>>> To: Jeff Roberson <jeff at freebsd.org>,
>> >>>>> Marshall Kirk McKusick <mckusick at mckusick.com>
>> >>>>> Cc: FreeBSD Current <freebsd-current at freebsd.org>
>> >>>>>
>> >>>>> Hi Jeff and Dr. McKusick,
>> >>>>> Ran into this panic when /usr ran out of space doing a make
>> >>>>> universe on amd64/r221219 (it took ~15 minutes for the panic to occur
>> >>>>> after the filesystem ran out of space -- wasn't quite sure what it was
>> >>>>> doing at the time):
>> >>>>>
>> >>>>> ...
>> >>>>>
>> >>>>> Let me know what other commands you would like for me to run in kgdb.
>> >>>>> Thanks,
>> >>>>> -Garrett
>> >>>>
>> >>>> You did not indicate whether you are running an 8.X system or a 9-current
>> >>>> system. It would be helpful to know that.
>> >>>
>> >>> I've actually been running CURRENT for a few years now, but you're right --
>> >>> I didn't mention that part.
>> >>>
>> >>>> Jeff thinks that there may be a potential race in the locking code for
>> >>>> softdep_request_cleanup. If so, this patch for 9-current should fix it:
>> >>>>
>> >>>> Index: ffs_softdep.c
>> >>>> ===================================================================
>> >>>> --- ffs_softdep.c (revision 221385)
>> >>>> +++ ffs_softdep.c (working copy)
>> >>>> @@ -11380,7 +11380,8 @@
>> >>>> continue;
>> >>>> }
>> >>>> MNT_IUNLOCK(mp);
>> >>>> - if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
>> >>>> + if (vget(lvp, LK_EXCLUSIVE | LK_NOWAIT | LK_INTERLOCK,
>> >>>> + curthread)) {
>> >>>> MNT_ILOCK(mp);
>> >>>> continue;
>> >>>> }
>> >>>>
>> >>>> If you are running an 8.X system, hopefully you will be able to apply it.
>> >>>
>> >>> I've applied it, rebuilt and installed the kernel, and trying to
>> >>> repro the case again. Will let you know how things go!
>> >>
>> >> Happened again with the change. It's really easy to repro:
>> >>
>> >> 1. Get a filesystem with UFS+SU
>> >> 2. Execute something that does a large number of small writes to a partition.
>> >> 3. 'dd if=/dev/zero of=FOO bs=10m' on the same partition
>> >>
>> >> The kernel will panic with the issue I discussed above.
>> >> Thanks!
>> >
>> > Jeff' change is required to avoid LORs, but it is not sufficient to
>> > prevent recursion. We must skip the vnode supplied as a parameter to
>> > softdep_request_cleanup(). Theoretically, other vnodes might be also
>> > locked by curthread, thus I think the change below is needed. Try this.
>> >
>> > diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
>> > index a6d4441..25fa5d6 100644
>> > --- a/sys/ufs/ffs/ffs_softdep.c
>> > +++ b/sys/ufs/ffs/ffs_softdep.c
>> > @@ -11380,7 +11380,9 @@ retry:
>> > continue;
>> > }
>> > MNT_IUNLOCK(mp);
>> > - if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
>> > + if (VOP_ISLOCKED(lvp) ||
>> > + vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
>> > + curthread)) {
>> > MNT_ILOCK(mp);
>> > continue;
>> > }
>>
>> Ran into the same panic after I applied the patch above with the repro steps I described before. One thing that I noticed is that the issue isn't as easy to reproduce unless you add the dd in parallel with the make operation.
>
> Well, I misread your original report. Also, there is another issue
> that is easily reproducable in similar situation. The latest patch
> is below.
>
> diff --git a/sys/sys/mount.h b/sys/sys/mount.h
> index 231e3d6..f064053 100644
> --- a/sys/sys/mount.h
> +++ b/sys/sys/mount.h
> @@ -366,6 +366,8 @@ void __mnt_vnode_markerfree(struct vnode **mvp, struct mount *mp);
> #define MNT_LAZY 3 /* push data not written by filesystem syncer */
> #define MNT_SUSPEND 4 /* Suspend file system after sync */
>
> +#define MNT_WAIT_ADV 0x10000000 /* MNT_WAIT prevent deadlock */
> +
> /*
> * Generic file handle
> */
> diff --git a/sys/ufs/ffs/ffs_alloc.c b/sys/ufs/ffs/ffs_alloc.c
> index e60514d..87837cc 100644
> --- a/sys/ufs/ffs/ffs_alloc.c
> +++ b/sys/ufs/ffs/ffs_alloc.c
> @@ -420,13 +420,13 @@ nospace:
> */
> if (reclaimed == 0) {
> reclaimed = 1;
> - softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
> - UFS_UNLOCK(ump);
> if (bp) {
> + UFS_UNLOCK(ump);
> brelse(bp);
> bp = NULL;
> + UFS_LOCK(ump);
> }
> - UFS_LOCK(ump);
> + softdep_request_cleanup(fs, vp, cred, FLUSH_BLOCKS_WAIT);
> goto retry;
> }
> UFS_UNLOCK(ump);
> diff --git a/sys/ufs/ffs/ffs_extern.h b/sys/ufs/ffs/ffs_extern.h
> index d819c8a..d12e1dc 100644
> --- a/sys/ufs/ffs/ffs_extern.h
> +++ b/sys/ufs/ffs/ffs_extern.h
> @@ -141,7 +141,7 @@ void softdep_setup_inofree(struct mount *, struct buf *, ino_t,
> void softdep_setup_sbupdate(struct ufsmount *, struct fs *, struct buf *);
> void *softdep_setup_trunc(struct vnode *vp, off_t length, int flags);
> void softdep_fsync_mountdev(struct vnode *);
> -int softdep_sync_metadata(struct vnode *);
> +int softdep_sync_metadata(struct vnode *, int flags);
> int softdep_process_worklist(struct mount *, int);
> int softdep_fsync(struct vnode *);
> int softdep_waitidle(struct mount *);
> diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c
> index a6d4441..0b66e68 100644
> --- a/sys/ufs/ffs/ffs_softdep.c
> +++ b/sys/ufs/ffs/ffs_softdep.c
> @@ -492,7 +492,7 @@ softdep_flushworklist(oldmnt, countp, td)
> }
>
> int
> -softdep_sync_metadata(struct vnode *vp)
> +softdep_sync_metadata(struct vnode *vp, int flags)
> {
>
> return (0);
> @@ -733,7 +733,7 @@ static void unlinked_inodedep(struct mount *, struct inodedep *);
> static void clear_unlinked_inodedep(struct inodedep *);
> static struct inodedep *first_unlinked_inodedep(struct ufsmount *);
> static int flush_pagedep_deps(struct vnode *, struct mount *,
> - struct diraddhd *);
> + struct diraddhd *, int);
> static void free_pagedep(struct pagedep *);
> static int flush_newblk_dep(struct vnode *, struct mount *, ufs_lbn_t);
> static int flush_inodedep_deps(struct mount *, ino_t);
> @@ -10662,7 +10662,7 @@ restart:
> * associated with the file. If any I/O errors occur, they are returned.
> */
> int
> -softdep_sync_metadata(struct vnode *vp)
> +softdep_sync_metadata(struct vnode *vp, int flags)
> {
> struct pagedep *pagedep;
> struct allocindir *aip;
> @@ -10792,7 +10792,8 @@ loop:
> continue;
> if ((error =
> flush_pagedep_deps(vp, wk->wk_mp,
> - &pagedep->pd_diraddhd[i]))) {
> + &pagedep->pd_diraddhd[i],
> + flags))) {
> FREE_LOCK(&lk);
> goto loop_end;
> }
> @@ -11056,10 +11057,11 @@ flush_newblk_dep(vp, mp, lbn)
> * Called with splbio blocked.
> */
> static int
> -flush_pagedep_deps(pvp, mp, diraddhdp)
> +flush_pagedep_deps(pvp, mp, diraddhdp, flags)
> struct vnode *pvp;
> struct mount *mp;
> struct diraddhd *diraddhdp;
> + int flags;
> {
> struct inodedep *inodedep;
> struct inoref *inoref;
> @@ -11069,8 +11071,13 @@ flush_pagedep_deps(pvp, mp, diraddhdp)
> int error = 0;
> struct buf *bp;
> ino_t inum;
> + int lkflags;
>
> ump = VFSTOUFS(mp);
> + lkflags = LK_EXCLUSIVE;
> + if ((flags & MNT_WAIT_ADV) != 0)
> + lkflags |= LK_NOWAIT;
> +
> restart:
> while ((dap = LIST_FIRST(diraddhdp)) != NULL) {
> /*
> @@ -11112,7 +11119,7 @@ restart:
> }
> if (dap->da_state & MKDIR_BODY) {
> FREE_LOCK(&lk);
> - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
> + if ((error = ffs_vgetf(mp, inum, lkflags, &vp,
> FFSV_FORCEINSMQ)))
> break;
> error = flush_newblk_dep(vp, mp, 0);
> @@ -11176,7 +11183,7 @@ retry:
> */
> if (dap == LIST_FIRST(diraddhdp)) {
> FREE_LOCK(&lk);
> - if ((error = ffs_vgetf(mp, inum, LK_EXCLUSIVE, &vp,
> + if ((error = ffs_vgetf(mp, inum, lkflags, &vp,
> FFSV_FORCEINSMQ)))
> break;
> error = ffs_update(vp, MNT_WAIT);
> @@ -11379,17 +11386,17 @@ retry:
> VI_UNLOCK(lvp);
> continue;
> }
> - MNT_IUNLOCK(mp);
> - if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK, curthread)) {
> - MNT_ILOCK(mp);
> + if (vget(lvp, LK_EXCLUSIVE | LK_INTERLOCK | LK_NOWAIT,
> + curthread)) {
> continue;
> }
> + MNT_IUNLOCK(mp);
> if (lvp->v_vflag & VV_NOSYNC) { /* unlinked */
> vput(lvp);
> MNT_ILOCK(mp);
> continue;
> }
> - (void) ffs_syncvnode(lvp, MNT_WAIT);
> + (void) ffs_syncvnode(lvp, MNT_WAIT | MNT_WAIT_ADV);
> vput(lvp);
> MNT_ILOCK(mp);
> }
> diff --git a/sys/ufs/ffs/ffs_vnops.c b/sys/ufs/ffs/ffs_vnops.c
> index cf6a5a8..c73e2a5 100644
> --- a/sys/ufs/ffs/ffs_vnops.c
> +++ b/sys/ufs/ffs/ffs_vnops.c
> @@ -216,9 +216,11 @@ ffs_syncvnode(struct vnode *vp, int waitfor)
> struct bufobj *bo;
> struct buf *bp;
> struct buf *nbp;
> - int s, error, wait, passes, skipmeta;
> + int s, error, wait, passes, skipmeta, wait_adv;
> ufs_lbn_t lbn;
>
> + wait_adv = waitfor & MNT_WAIT_ADV;
> + waitfor &= ~MNT_WAIT_ADV;
> wait = (waitfor == MNT_WAIT);
> lbn = lblkno(ip->i_fs, (ip->i_size + ip->i_fs->fs_bsize - 1));
> bo = &vp->v_bufobj;
> @@ -328,7 +330,7 @@ loop:
> * with the vnode has been written.
> */
> splx(s);
> - if ((error = softdep_sync_metadata(vp)) != 0)
> + if ((error = softdep_sync_metadata(vp, wait_adv)) != 0)
> return (error);
> s = splbio();
Things look ok with that patch and the one that Jeff provided for
the LOR, taking into account your style change with the flag list.
Thanks!
-Garrett
More information about the freebsd-current
mailing list