Linker deadlock.

Kostik Belousov kostikbel at gmail.com
Sun Aug 3 11:30:08 UTC 2008


On Sun, Aug 03, 2008 at 09:32:40AM +0200, Pawel Jakub Dawidek wrote:
> Hi.
> 
> Linker can easily deadlock when we try to load the same kernel module
> from two processes at the same time. This is because we drop kld_sx in
> linker_load_file() and reacquire it, which leads to LOR, because we
> already hold the vnode lock at this point. Interesting backtraces below.
> 
> First process:
> 
> db> tr 3066
> Tracing pid 3066 tid 100090 td 0x8514b240
> sched_switch(8514b240,0,104,177,bb6bbb2e,...) at sched_switch+0x40e
> mi_switch(104,0,80681605,1ca,0,...) at mi_switch+0x200
> sleepq_switch(8514b240,0,80681605,237,80a281ec,...) at sleepq_switch+0x14d
> sleepq_wait(80a281ec,0,8067a18b,3,0,...) at sleepq_wait+0x63
> _sx_xlock_hard(80a281ec,8514b240,0,8067a1cf,1a0,...) at _sx_xlock_hard+0x2c6
> _sx_xlock(80a281ec,0,8067a1cf,1a0,0,...) at _sx_xlock+0x99
> linker_load_module(853a1264,0,83ba8940,83ba893c,83ba8938,...) at linker_load_module+0xa4a
> linker_load_dependencies(84fb8500,bb74,8539f000,2adc,156000,...) at linker_load_dependencies+0x194
> link_elf_load_file(806b74e0,8557e4c0,83ba8c24,17c,0,...) at link_elf_load_file+0x4f0
> linker_load_module(0,83ba8c4c,8067a1cf,3cd,280cb730,...) at linker_load_module+0x8db
> kern_kldload(8514b240,8592d400,83ba8c70,0,b395eb11,...) at kern_kldload+0xc8
> [...]
> db> show lock 0x80a281ec
>  class: sx
>  name: kernel linker
>  state: XLOCK: 0x8514bd80 (tid 100117, pid 3065, "zpool")
>  waiters: exclusive
> 
> Second process:
> 
> db> tr 3065
> Tracing pid 3065 tid 100117 td 0x8514bd80
> sched_switch(8514bd80,0,104,177,bb7e358b,...) at sched_switch+0x40e
> mi_switch(104,0,80681605,1ca,50,...) at mi_switch+0x200
> sleepq_switch(8514bd80,0,80681605,237,8523d9c0,...) at sleepq_switch+0x14d
> sleepq_wait(8523d9c0,50,806906bb,4,0,...) at sleepq_wait+0x63
> __lockmgr_args(8523d9c0,80100,8523da28,0,0,...) at __lockmgr_args+0x9a5
> vop_stdlock(83bd2660,8508aa80,2,80100,8523d968,...) at vop_stdlock+0x65
> VOP_LOCK1_APV(806c3560,83bd2660,806d2ac0,8523d968,80100,...) at VOP_LOCK1_APV+0xa5
> _vn_lock(8523d968,80100,8068815b,802,804c9cb4,...) at _vn_lock+0x5e
> vget(8523d968,80100,8514bd80,1b7,8065d00f,...) at vget+0xc9
> cache_lookup(85090158,83bd2a00,83bd2a14,0,84f3b400,...) at cache_lookup+0x4c2
> nfs_lookup(83bd2838,80688e43,806d2720,80000,85090158,...) at nfs_lookup+0x101
> VOP_LOOKUP_APV(806c3560,83bd2838,8068783d,1bd,83bd2a00,...) at VOP_LOOKUP_APV+0xe5
> lookup(83bd29e8,8068783d,e0,c0,8506e52c,...) at lookup+0x52e
> namei(83bd29e8,81159a38,80a352b4,4,8067be1f,...) at namei+0x48b
> vn_open_cred(83bd29e8,83bd2a4c,0,84f3b400,0,...) at vn_open_cred+0x2ba
> vn_open(83bd29e8,83bd2a4c,0,0,806b2a00,...) at vn_open+0x33
> linker_lookup_file(3,0,3,8514bd80,0,...) at linker_lookup_file+0x163
> linker_load_module(0,83bd2c4c,8067a1cf,3cd,280cb730,...) at linker_load_module+0x7bd
> kern_kldload(8514bd80,85a7e400,83bd2c70,0,b395eb11,...) at kern_kldload+0xc8
> [...]
> db> show vnode 0x8523d968
> vnode 0x8523d968: tag nfs, type VREG
>     usecount 1, writecount 0, refcount 189 mountedhere 0
>     flags ()
>     v_object 0x852489b0 ref 0 pages 372
>      lock type nfs: EXCL by thread 0x8514b240 (pid 3066)
>  with exclusive waiters pending
> #0 0x804c2e5d at __lockmgr_args+0xa6d
> #1 0x80546c85 at vop_stdlock+0x65
> #2 0x8065dcd5 at VOP_LOCK1_APV+0xa5
> #3 0x805627ee at _vn_lock+0x5e
> #4 0x80557419 at vget+0xc9
> #5 0x805444b2 at cache_lookup+0x4c2
> #6 0x805c3b51 at nfs_lookup+0x101
> #7 0x8065ee65 at VOP_LOOKUP_APV+0xe5
> #8 0x8054a9be at lookup+0x52e
> #9 0x8054b5eb at namei+0x48b
> #10 0x805621da at vn_open_cred+0x2ba
> #11 0x80562463 at vn_open+0x33
> #12 0x804f45e8 at link_elf_load_file+0x68
> #13 0x804c0f9b at linker_load_module+0x8db
> #14 0x804c1568 at kern_kldload+0xc8
> #15 0x804c1624 at kldload+0x74
> #16 0x80650513 at syscall+0x283
> #17 0x80634e40 at Xint0x80_syscall+0x20
> [...]

Source line backtraces would be nicer, since the gcc inliner forces me to
guess. It seems that linker_load_module() calls linker_load_file(),
which drops and reacquires the linker lock.

Then, it seems that dropping the module's vnode lock around the call to
linker_load_dependencies() should help.

diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c
index 2664ba9..52b3f8f 100644
--- a/sys/kern/link_elf.c
+++ b/sys/kern/link_elf.c
@@ -802,7 +802,9 @@ link_elf_load_file(linker_class_t cls, const char* filename,
 	goto out;
     link_elf_reloc_local(lf);
 
+    VOP_UNLOCK(nd.ni_vp, 0);
     error = linker_load_dependencies(lf);
+    vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
     if (error)
 	goto out;
 #if 0	/* this will be more trouble than it's worth for now */
diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c
index d8e9219..657dd0e 100644
--- a/sys/kern/link_elf_obj.c
+++ b/sys/kern/link_elf_obj.c
@@ -798,7 +798,9 @@ link_elf_load_file(linker_class_t cls, const char *filename,
 	link_elf_reloc_local(lf);
 
 	/* Pull in dependencies */
+	VOP_UNLOCK(nd.ni_vp, 0);
 	error = linker_load_dependencies(lf);
+	vn_lock(nd.ni_vp, LK_EXCLUSIVE | LK_RETRY);
 	if (error)
 		goto out;
 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 195 bytes
Desc: not available
Url : http://lists.freebsd.org/pipermail/freebsd-hackers/attachments/20080803/5b7e8db9/attachment.pgp


More information about the freebsd-hackers mailing list