Deadlock from fsx/mksnap_ffs

Kris Kennaway kris at obsecurity.org
Thu Feb 23 16:18:05 PST 2006


On Thu, Feb 23, 2006 at 06:13:29PM -0600, Greg Rivers wrote:
> On Thu, 23 Feb 2006, Kris Kennaway wrote:
> 
> >>I believe this issue has made it onto the show-stopper list for
> >>6.1-RELEASE and is being actively worked on.
> >
> >It's on the todo list, but I don't think it's being worked on yet.
> >The main problem is that we need a way to reproduce it on command.
> >I'd forgotten that snapshots are involved, so maybe it's just a matter
> >of running lots of mksnap_ffs while I/O is in progress.
> >
> >kris
> >
> 
> It happens with or without snapshots, but snapshots are a lot more likely 
> to make it happen.  In my case, approximately 1 in 3 snapshots will do it. 
> Without snapshots, I get a deadlock about every ten days in a population 
> of three hosts.
> 
> Tor Egge and Don Lewis were kind enough to work with me off-list for a bit 
> last December.  They analyzed several of the core files I produced and I 
> think they have a fair understanding of what the problems are.  But I 
> wouldn't presume to put words in their mouths; perhaps they'll give us an 
> update.  I see from the todo list that Tor may already be working on the 
> deadlock for amd64.

OK, I reproduced it pretty easily by running fsx against mksnap_ffs.

  828 c9525448    0   579   828 0004002 [SLPQ ufs 0xcb758c28][SLP] mksnap_ffs
  820 c96d5224    0   684   820 0004002 [SLPQ suspfs 0xc9035cb8][SLP] fsx

db> show lockedvnods
Locked vnodes

0xcb758bd0: tag ufs, type VREG
    usecount 2, writecount 1, refcount 8 mountedhere 0
    flags ()
    v_object 0xc9d32438 ref 1 pages 26
     lock type ufs: EXCL (count 1) by thread 0xc96d4000 (pid 820) with 1 pending#0 0xc0521640 at lockmgr+0x584
#1 0xc058e493 at vop_stdlock+0x32
#2 0xc06f623e at VOP_LOCK_APV+0xa6
#3 0xc06635f5 at ffs_lock+0x19
#4 0xc06f623e at VOP_LOCK_APV+0xa6
#5 0xc05a6f10 at vn_lock+0xd3
#6 0xc068768c at vm_object_sync+0x16c
#7 0xc068285c at vm_map_sync+0x2a1
#8 0xc0684ba4 at msync+0x7f
#9 0xc06dd45c at syscall+0x2e7
#10 0xc06c755f at Xint0x80_syscall+0x1f

        ino 282625, on dev da0s1e
VNASSERT failed
0xca1b5600: tag (null), type VMARKER
    usecount 0, writecount 0, refcount 0 mountedhere 0
    flags ()
    db>

db> wh 828
Tracing pid 828 tid 100079 td 0xc90d6340
sched_switch(c90d6340,0,1,15c,c078b4f0) at sched_switch+0x19b
mi_switch(1,0,c0722811,1c0,2) at mi_switch+0x2d0
sleepq_switch(cb758c28,0,c0722811,212,ee12e3c4) at sleepq_switch+0x10d
sleepq_wait(cb758c28,0,c072013c,c8,0) at sleepq_wait+0x66
msleep(cb758c28,c0786b28,50,c07274f4,0) at msleep+0x302
acquire(ee12e470,60,60000,b2,c90d6340) at acquire+0x88
lockmgr(cb758c28,2022,cb758c98,c90d6340,ee12e49c) at lockmgr+0x4dc
vop_stdlock(ee12e4f0,cb758c98,9,c07717e0,ee12e4f0) at vop_stdlock+0x32
VOP_LOCK_APV(c0771d20,ee12e4f0,ee12e4c8,c06f623e,ee12e4f0) at VOP_LOCK_APV+0xa6
ffs_lock(ee12e4f0,c078d0b4,ee12e4d8,2022,cb758bd0) at ffs_lock+0x19
VOP_LOCK_APV(c07717e0,ee12e4f0,c0734ad5,ee12e4f4,c055b153) at VOP_LOCK_APV+0xa6
vn_lock(cb758bd0,2022,c90d6340,c0734acc,2022) at vn_lock+0xd3
vget(cb758bd0,2022,c90d6340,482,c9035c88) at vget+0xf0
ffs_sync(c9035c00,1,c90d6340,c072a290,0) at ffs_sync+0x192
vfs_write_suspend(c9035c00,ee12e7d0,df59d408,1,c8bc9880) at vfs_write_suspend+0xd5
ffs_snapshot(c9035c00,cd07a480,ee12e944,6c,c106c988) at ffs_snapshot+0xd36
ffs_mount(c9035c00,c90d6340,c0728f74,33f,246) at ffs_mount+0xd21
vfs_domount(c90d6340,ca133a80,ccdda980,1211000,cc6c8e80) at vfs_domount+0x644
vfs_donmount(c90d6340,1211000,ee12eb9c,cce83880,e) at vfs_donmount+0x51f
kernel_mount(cc0bcd80,1211000,ee12ebe0,6c,bfbfe98e) at kernel_mount+0x7d
ffs_cmount(cc0bcd80,bfbfe1f0,1211000,c90d6340,c07714c0) at ffs_cmount+0x85
mount(c90d6340,ee12ed04,c0740342,3ed,c97ad630) at mount+0x1d7
syscall(3b,3b,3b,28152dd8,bfbfe190) at syscall+0x2e7
Xint0x80_syscall() at Xint0x80_syscall+0x1f
--- syscall (21, FreeBSD ELF32, mount), eip = 0x280c068f, esp = 0xbfbfe15c, ebp = 0xbfbfe858 ---
db> wh 820
Tracing pid 820 tid 100106 td 0xc96d4000
sched_switch(c96d4000,0,1,15c,c078bdb0) at sched_switch+0x19b
mi_switch(1,0,c0722811,1c0,0) at mi_switch+0x2d0
sleepq_switch(c9035cb8,0,c0722811,212,ee1fa940) at sleepq_switch+0x10d
sleepq_wait(c9035cb8,0,c072013c,c8,0) at sleepq_wait+0x66
msleep(c9035cb8,c9035c88,9f,c072a26e,0) at msleep+0x302
vn_start_write(cb758bd0,ee1fa990,1,3f9,c9035c00) at vn_start_write+0xe0
vnode_pager_putpages(c9d32438,ee1faa60,f,5,ee1fa9e0) at vnode_pager_putpages+0x77
vm_pageout_flush(ee1faa60,f,5,375,c071d6bf) at vm_pageout_flush+0x16d
vm_object_page_collect_flush(c9d32438,c11f5338,139a0,5,10) at vm_object_page_collect_flush+0x2f9
vm_object_page_clean(c9d32438,7,0,16,0) at vm_object_page_clean+0x1b7
vm_object_sync(c9d32438,7000,0,f000,1) at vm_object_sync+0x1e9
vm_map_sync(c9709420,2816f000,2817e000,1,0) at vm_map_sync+0x2a1
msync(c96d4000,ee1fad04,c,c96d4000,c9709420) at msync+0x7f
syscall(3b,3b,bfbf003b,ddc3,73b5) at syscall+0x2e7
Xint0x80_syscall() at Xint0x80_syscall+0x1f
--- syscall (65, FreeBSD ELF32, msync), eip = 0x280cd4cf, esp = 0xbfbfdf6c, ebp = 0xbfbfdfc8 ---
db> 

I have also seen a deadlock via msync() without snapshots in use (I
have a core from that too), which fits with your observation.

Kris
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 187 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-stable/attachments/20060223/7834e391/attachment.bin


More information about the freebsd-stable mailing list