lock up in 6.2 (procs massively stuck in Giant)
pluknet
pluknet at gmail.com
Tue Jun 23 12:24:05 UTC 2009
2009/5/13 John Baldwin <jhb at freebsd.org>:
> On Wednesday 13 May 2009 11:41:22 am pluknet wrote:
>> 2009/5/13 John Baldwin <jhb at freebsd.org>:
>> > On Wednesday 13 May 2009 2:40:33 am pluknet wrote:
>> >> 2009/5/13 pluknet <pluknet at gmail.com>:
>> >> > 2009/5/13 John Baldwin <jhb at freebsd.org>:
>> >> >> On Tuesday 12 May 2009 4:59:19 pm pluknet wrote:
>> >> >>> Hi.
>> >> >>>
>> >> >>> From just another box (not from the first two mentioned earlier)
>> >> >>> with a similar locking issue. If it would make sense, since there are
>> >> >>> possibly a bit different conditions.
>> >> >>> clock proc here is on swi4, I hope it's a non-important difference.
>> >> >>>
>> >> >>> 18 0 0 0 LL *Giant 0xd0a6b140 [swi4: clock
> sio]
>> >> >>> db> bt 18
>> >> >>
>> >> >> Ok, this is a known issue in 6.x. It is fixed in 6.4.
>> >> >>
>> >>
>> >> Looking at the face of kern_timeout.c I suspect that was fixed in
> r181012.
>> >
>> > No, this particular issue is fixed by a change to sched_4bsd.c in r179975.
>> >
>>
>> Gah.. We constrained to use ule scheduler on 6.x (yes, I know that
>> "it's known to be broken (c)"), since we have had a very bad interactivity
>> on 4bsd on our workload. Ok, that's just another reason to move to 7.x.
>
> Hmmm I would have thought ULE wouldn't have suffered from this bug. The
> problem on 4BSD was if softclock ever blocked on Giant and the thread that
> held Giant was on a run queue and pinned to a specific CPU but that another
> userland thread was running on that CPU already, the userland thread would
> never yield the CPU so long as it kept busy since the round robin timeout
> would never run.
>
> --
> John Baldwin
>
That's another sort of lockup on 6.2 we experience often.
May that be connected to ULE on 6.x?
I regret if this info is not enough.
db> ps
pid ppid pgrp uid state wmesg wchan cmd
74606 74602 68315 0 R stat
74605 74601 68315 0 S piperd 0xcd7b0198 head
74603 74601 68315 0 S piperd 0xc8ca2198 sort
74602 74601 68315 0 S wait 0xcaed6000 find
74601 68319 68315 0 S wait 0xd0f5e860 sh
74588 7495 7495 13581 S lockf 0xd1919dc0 httpd
74587 7495 7495 13581 S lockf 0xce42b400 httpd
74586 8016 8016 7336 R httpd
74585 8016 8016 7336 R httpd
74584 9498 9498 26316 R httpd
74341 3399 8150 13289 R CPU 7 perl5.8.8
74020 7495 7495 13581 S lockf 0xccf31180 httpd
74019 8247 8247 26256 R httpd
74018 8016 8016 7336 R CPU 4 httpd
72732 9190 9190 26291 RL CPU 1 httpd
72731 9190 9190 26291 S accept 0xcd31572e httpd
72729 8693 8693 26404 R httpd
72727 9190 9190 26291 S accept 0xcd31572e httpd
72726 9396 9396 26262 R httpd
72088 7495 7495 13581 S kqread 0xcb2f9400 httpd
72087 9190 9190 26291 S accept 0xcd31572e httpd
72085 9190 9190 26291 S accept 0xcd31572e httpd
72084 8162 8162 18538 R httpd
71402 7495 7495 13581 S lockf 0xccfab3c0 httpd
71401 8162 8162 18538 R httpd
71400 9190 9190 26291 S accept 0xcd31572e httpd
71399 8716 8716 26278 R CPU 3 httpd
70063 7574 7574 11303 S lockf 0xccf312c0 httpd
69417 8371 8371 25968 R httpd
69416 9030 9030 39658 R httpd
68319 68318 68315 0 S piperd 0xd1b9f198 sh
68318 68315 68315 0 S wait 0xc82f7648 lockf
68315 68313 68315 0 Ss wait 0xca914430 sh
68313 34501 34501 0 S piperd 0xcfbef000 cron
68310 8016 8016 7336 R httpd
68309 64318 64318 14620 R httpd
68308 9111 9111 26280 S lockf 0xca51cc00 httpd
68302 8595 8595 26129 RL httpd
68301 9190 9190 26291 S accept 0xcd31572e httpd
68300 8483 8483 26049 R httpd
68296 8747 8747 33525 R httpd
68287 8952 8952 26340 R httpd
68282 9110 9110 26102 R httpd
68280 9110 9110 26102 R httpd
68272 8339 8339 17137 S accept 0xcc5159f6 httpd
68271 8595 8595 26129 R httpd
68269 9470 9470 26006 R httpd
68268 9030 9030 39658 S sbwait 0xc89d0da4 httpd
68251 36391 36391 38054 R httpd
68249 7527 7527 16760 R httpd
68247 9030 9030 39658 R httpd
68245 8901 8901 26031 S accept 0xcd3159f6 httpd
68239 8928 8928 26128 R httpd
68238 8928 8928 26128 S lockf 0xd1659c40 httpd
68214 7619 7619 6478 S accept 0xcb25219e httpd
68210 8675 8675 26171 S accept 0xc8ca719e httpd
68205 8388 8388 27687 R httpd
68199 8339 8339 17137 R httpd
68197 9169 9169 26030 S accept 0xcaa7a03a httpd
[...]
db> sh allpcpu
Current CPU: 1
cpuid = 0
curthread = 0xc7cfae10: pid 19 "swi4: clock sio"
curpcb = 0xe6895d90
fpcurthread = none
idlethread = 0xc7cfaaf0: pid 17 "idle: cpu0"
APIC ID = 0
currentldt = 0x50
cpuid = 1
curthread = 0xcc20c000: pid 72732 "httpd"
curpcb = 0xefcaad90
fpcurthread = none
idlethread = 0xc7cfa000: pid 16 "idle: cpu1"
APIC ID = 1
currentldt = 0x50
cpuid = 2
curthread = 0xc8413e10: pid 35417 "proftpd"
curpcb = 0xf0e55d90
fpcurthread = none
idlethread = 0xc7cf9e10: pid 15 "idle: cpu2"
APIC ID = 2
currentldt = 0x50
cpuid = 3
curthread = 0xcfd37320: pid 71399 "httpd"
curpcb = 0xf0786d90
fpcurthread = none
idlethread = 0xc7cf9c80: pid 14 "idle: cpu3"
APIC ID = 3
currentldt = 0x50
cpuid = 4
curthread = 0xd0f68320: pid 74018 "httpd"
curpcb = 0xf0a79d90
fpcurthread = none
idlethread = 0xc7cf9af0: pid 13 "idle: cpu4"
APIC ID = 4
currentldt = 0x50
cpuid = 5
curthread = 0xcb92a640: pid 56782 "httpd"
curpcb = 0xef96bd90
fpcurthread = none
idlethread = 0xc7cf9960: pid 12 "idle: cpu5"
APIC ID = 5
currentldt = 0x50
cpuid = 6
curthread = 0xd1e98640: pid 67921 "httpd"
curpcb = 0xf1545d90
fpcurthread = none
idlethread = 0xc7cf97d0: pid 11 "idle: cpu6"
APIC ID = 6
currentldt = 0x50
cpuid = 7
curthread = 0xca523e10: pid 74341 "perl5.8.8"
curpcb = 0xeee48d90
fpcurthread = none
idlethread = 0xc7cf9640: pid 10 "idle: cpu7"
APIC ID = 7
currentldt = 0x50
db> bt 74606
Tracing pid 74606 tid 100065 td 0xc8138190
sched_switch(c8138190,0,2) at sched_switch+0x143
mi_switch(2,0) at mi_switch+0x1ba
critical_exit(c0a06c60,ee77e9d4,c0899240,0,28060008,...) at critical_exit+0x9d
lapic_handle_timer(0) at lapic_handle_timer+0xc9
Xtimerint(c0a06c60,c8138190,0,0,0) at Xtimerint+0x30
namei(ee77ebcc) at namei+0x1ec
vn_open_cred(ee77ebcc,ee77eccc,1a4,d06a0b00,5,...) at vn_open_cred+0x2ad
vn_open(ee77ebcc,ee77eccc,1a4,5) at vn_open+0x1e
kern_open(c8138190,2806ac48,0,1,1b6,...) at kern_open+0xb6
open(c8138190,ee77ed04) at open+0x1a
syscall(3b,3b,3b,4,28070c80,...) at syscall+0x2bf
Xint0x80_syscall() at Xint0x80_syscall+0x1f
--- syscall (5, FreeBSD ELF32, open), eip = 0x280555af, esp =
0xbfbfebfc, ebp = 0xbfbfec28 ---
db> bt 74341
Tracing pid 74341 tid 101681 td 0xca523e10
sched_switch(7,7,1,eee48a0c,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> bt 67921
Tracing pid 67921 tid 102585 td 0xd1e98640
sched_switch(d1e98640,c7cfa960,6) at sched_switch+0x143
mi_switch(6,c7cfa960,c7cfaab8,c0a0b460,f154592c,...) at mi_switch+0x1ba
maybe_preempt(c7cfa960) at maybe_preempt+0xc4
sched_add(c7cfa960,4,d1e98640,c7cfa960,f1545950,...) at sched_add+0x258
setrunqueue(f1545968,c066d0fc,0,c7cef69c,0,...) at setrunqueue+0x63
_end() at 0xd1e98640
db> show lockchain 67921
thread 102585 (pid 67921, httpd) running on CPU 6
db> show lock Giant
class: sleep mutex
name: Giant
flags: {DEF, RECURSE}
state: {OWNED, CONTESTED}
owner: 0xd0f68320 (tid 102209, pid 74018, "httpd")
db> bt 74018
Tracing pid 74018 tid 102209 td 0xd0f68320
sched_switch(c0a0a0b0,cd635c48,c9018f68,c0a0baf0,1,...) at sched_switch+0x143
MAXCPU(2004,cd635af0,0,16,1,...) at 0x7
_end() at 0xcb62b608
db> bt 71399
Tracing pid 71399 tid 101992 td 0xcfd37320
sched_switch(5,5,1,f0786a0c,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> bt 35417
Tracing pid 35417 tid 102675 td 0xc8413e10
sched_switch(7,7,1,f0e55ad8,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> show proc 74018
Process 74018 (httpd) at 0xd0f73218:
state: NORMAL
uid: 7336 gids: 999, 999, 7336
parent: pid 8016 at 0xcaf69648
ABI: FreeBSD ELF32
arguments: /home/xxx/etc/apache/bin/httpd
threads: 1
102209 Run CPU 4 httpd
db> show lockchain 74018
thread 102209 (pid 74018, httpd) running on CPU 4
db> show turnstile
db> show lockedbufs
buf at 0xdbda0400
b_flags = 0x20000000<vmio>
b_error = 0, b_bufsize = 16384, b_bcount = 16384, b_resid = 0
b_bufobj = (0xc8160d80), b_data = 0xdcdb1000, b_blkno = 920752960
b_npages = 4, pages(OBJ, IDX, PA): (0xc8163084, 0x6dc3268,
0x60fd4000),(0xc8163084, 0x6dc3269, 0x58495000),(0xc8163084,
0x6dc326a, 0x1b8b6000),(0xc8163084, 0x6dc326b, 0x74f97000)
db> show lockedvnods
Locked vnodes
db> show sleepq
db> show lockmgr
db> show sleepchain
thread 101368 (pid 72732, httpd) running on CPU 1
db> bt 72732
Tracing pid 72732 tid 101368 td 0xcc20c000
kdb_enter(c094016e) at kdb_enter+0x2b
siointr1(c7ee6c00) at siointr1+0xce
siointr(c7ee6c00) at siointr+0x5e
intr_execute_handlers(c7cef4c8,efcaab98,4,efcaabe8,c0899013,...) at
intr_execute_handlers+0xe1
lapic_handle_intr(37) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc067a3be, esp = 0xefcaabdc, ebp = 0xefcaabe8 ---
_mtx_lock_sleep(c0a06c60,cc20c000,0,0,0) at _mtx_lock_sleep+0xb6
vm_fault(ca2a1818,28747000,1,0) at vm_fault+0x1a5
trap_pfault(efcaad38,1,28747584,28747584,0,...) at trap_pfault+0x123
trap(280b003b,bfbf003b,bfbf003b,280b79c8,bfbfed84,...) at trap+0x1eb
calltrap() at calltrap+0x5
--- trap 0xc, eip = 0x28747584, esp = 0xbfbfeaec, ebp = 0xbfbfeb08 ---
db> bt
Tracing pid 72732 tid 101368 td 0xcc20c000
kdb_enter(c094016e) at kdb_enter+0x2b
siointr1(c7ee6c00) at siointr1+0xce
siointr(c7ee6c00) at siointr+0x5e
intr_execute_handlers(c7cef4c8,efcaab98,4,efcaabe8,c0899013,...) at
intr_execute_handlers+0xe1
lapic_handle_intr(37) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc067a3be, esp = 0xefcaabdc, ebp = 0xefcaabe8 ---
_mtx_lock_sleep(c0a06c60,cc20c000,0,0,0) at _mtx_lock_sleep+0xb6
vm_fault(ca2a1818,28747000,1,0) at vm_fault+0x1a5
trap_pfault(efcaad38,1,28747584,28747584,0,...) at trap_pfault+0x123
trap(280b003b,bfbf003b,bfbf003b,280b79c8,bfbfed84,...) at trap+0x1eb
calltrap() at calltrap+0x5
--- trap 0xc, eip = 0x28747584, esp = 0xbfbfeaec, ebp = 0xbfbfeb08 ---
--
wbr,
pluknet
More information about the freebsd-stable
mailing list