lock up in 6.2 (procs massively stuck in Giant)

pluknet pluknet at gmail.com
Tue Jun 23 12:24:05 UTC 2009


2009/5/13 John Baldwin <jhb at freebsd.org>:
> On Wednesday 13 May 2009 11:41:22 am pluknet wrote:
>> 2009/5/13 John Baldwin <jhb at freebsd.org>:
>> > On Wednesday 13 May 2009 2:40:33 am pluknet wrote:
>> >> 2009/5/13 pluknet <pluknet at gmail.com>:
>> >> > 2009/5/13 John Baldwin <jhb at freebsd.org>:
>> >> >> On Tuesday 12 May 2009 4:59:19 pm pluknet wrote:
>> >> >>> Hi.
>> >> >>>
>> >> >>> From just another box (not from the first two mentioned earlier)
>> >> >>> with a similar locking issue. If it would make sense, since there are
>> >> >>> possibly a bit different conditions.
>> >> >>> clock proc here is on swi4, I hope it's a non-important difference.
>> >> >>>
>> >> >>>    18     0     0     0  LL     *Giant    0xd0a6b140 [swi4: clock
> sio]
>> >> >>> db> bt 18
>> >> >>
>> >> >> Ok, this is a known issue in 6.x.  It is fixed in 6.4.
>> >> >>
>> >>
>> >> Looking at the face of kern_timeout.c I suspect that was fixed in
> r181012.
>> >
>> > No, this particular issue is fixed by a change to sched_4bsd.c in r179975.
>> >
>>
>> Gah.. We constrained to use ule scheduler on 6.x (yes, I know that
>> "it's known to be broken (c)"), since we have had a very bad interactivity
>> on 4bsd on our workload. Ok, that's just another reason to move to 7.x.
>
> Hmmm I would have thought ULE wouldn't have suffered from this bug.  The
> problem on 4BSD was if softclock ever blocked on Giant and the thread that
> held Giant was on a run queue and pinned to a specific CPU but that another
> userland thread was running on that CPU already, the userland thread would
> never yield the CPU so long as it kept busy since the round robin timeout
> would never run.
>
> --
> John Baldwin
>

That's another sort of lockup on 6.2 we experience often.
May that be connected to ULE on 6.x?
I regret if this info is not enough.

db> ps
  pid  ppid  pgrp   uid   state   wmesg     wchan    cmd
74606 74602 68315     0  R                           stat
74605 74601 68315     0  S       piperd   0xcd7b0198 head
74603 74601 68315     0  S       piperd   0xc8ca2198 sort
74602 74601 68315     0  S       wait     0xcaed6000 find
74601 68319 68315     0  S       wait     0xd0f5e860 sh
74588  7495  7495 13581  S       lockf    0xd1919dc0 httpd
74587  7495  7495 13581  S       lockf    0xce42b400 httpd
74586  8016  8016  7336  R                           httpd
74585  8016  8016  7336  R                           httpd
74584  9498  9498 26316  R                           httpd
74341  3399  8150 13289  R       CPU 7               perl5.8.8
74020  7495  7495 13581  S       lockf    0xccf31180 httpd
74019  8247  8247 26256  R                           httpd
74018  8016  8016  7336  R       CPU 4               httpd
72732  9190  9190 26291  RL      CPU 1               httpd
72731  9190  9190 26291  S       accept   0xcd31572e httpd
72729  8693  8693 26404  R                           httpd
72727  9190  9190 26291  S       accept   0xcd31572e httpd
72726  9396  9396 26262  R                           httpd
72088  7495  7495 13581  S       kqread   0xcb2f9400 httpd
72087  9190  9190 26291  S       accept   0xcd31572e httpd
72085  9190  9190 26291  S       accept   0xcd31572e httpd
72084  8162  8162 18538  R                           httpd
71402  7495  7495 13581  S       lockf    0xccfab3c0 httpd
71401  8162  8162 18538  R                           httpd
71400  9190  9190 26291  S       accept   0xcd31572e httpd
71399  8716  8716 26278  R       CPU 3               httpd
70063  7574  7574 11303  S       lockf    0xccf312c0 httpd
69417  8371  8371 25968  R                           httpd
69416  9030  9030 39658  R                           httpd
68319 68318 68315     0  S       piperd   0xd1b9f198 sh
68318 68315 68315     0  S       wait     0xc82f7648 lockf
68315 68313 68315     0  Ss      wait     0xca914430 sh
68313 34501 34501     0  S       piperd   0xcfbef000 cron
68310  8016  8016  7336  R                           httpd
68309 64318 64318 14620  R                           httpd
68308  9111  9111 26280  S       lockf    0xca51cc00 httpd
68302  8595  8595 26129  RL                          httpd
68301  9190  9190 26291  S       accept   0xcd31572e httpd
68300  8483  8483 26049  R                           httpd
68296  8747  8747 33525  R                           httpd
68287  8952  8952 26340  R                           httpd
68282  9110  9110 26102  R                           httpd
68280  9110  9110 26102  R                           httpd
68272  8339  8339 17137  S       accept   0xcc5159f6 httpd
68271  8595  8595 26129  R                           httpd
68269  9470  9470 26006  R                           httpd
68268  9030  9030 39658  S       sbwait   0xc89d0da4 httpd
68251 36391 36391 38054  R                           httpd
68249  7527  7527 16760  R                           httpd
68247  9030  9030 39658  R                           httpd
68245  8901  8901 26031  S       accept   0xcd3159f6 httpd
68239  8928  8928 26128  R                           httpd
68238  8928  8928 26128  S       lockf    0xd1659c40 httpd
68214  7619  7619  6478  S       accept   0xcb25219e httpd
68210  8675  8675 26171  S       accept   0xc8ca719e httpd
68205  8388  8388 27687  R                           httpd
68199  8339  8339 17137  R                           httpd
68197  9169  9169 26030  S       accept   0xcaa7a03a httpd
[...]
db> sh allpcpu
Current CPU: 1

cpuid        = 0
curthread    = 0xc7cfae10: pid 19 "swi4: clock sio"
curpcb       = 0xe6895d90
fpcurthread  = none
idlethread   = 0xc7cfaaf0: pid 17 "idle: cpu0"
APIC ID      = 0
currentldt   = 0x50

cpuid        = 1
curthread    = 0xcc20c000: pid 72732 "httpd"
curpcb       = 0xefcaad90
fpcurthread  = none
idlethread   = 0xc7cfa000: pid 16 "idle: cpu1"
APIC ID      = 1
currentldt   = 0x50

cpuid        = 2
curthread    = 0xc8413e10: pid 35417 "proftpd"
curpcb       = 0xf0e55d90
fpcurthread  = none
idlethread   = 0xc7cf9e10: pid 15 "idle: cpu2"
APIC ID      = 2
currentldt   = 0x50

cpuid        = 3
curthread    = 0xcfd37320: pid 71399 "httpd"
curpcb       = 0xf0786d90
fpcurthread  = none
idlethread   = 0xc7cf9c80: pid 14 "idle: cpu3"
APIC ID      = 3
currentldt   = 0x50

cpuid        = 4
curthread    = 0xd0f68320: pid 74018 "httpd"
curpcb       = 0xf0a79d90
fpcurthread  = none
idlethread   = 0xc7cf9af0: pid 13 "idle: cpu4"
APIC ID      = 4
currentldt   = 0x50
cpuid        = 5
curthread    = 0xcb92a640: pid 56782 "httpd"
curpcb       = 0xef96bd90
fpcurthread  = none
idlethread   = 0xc7cf9960: pid 12 "idle: cpu5"
APIC ID      = 5
currentldt   = 0x50

cpuid        = 6
curthread    = 0xd1e98640: pid 67921 "httpd"
curpcb       = 0xf1545d90
fpcurthread  = none
idlethread   = 0xc7cf97d0: pid 11 "idle: cpu6"
APIC ID      = 6
currentldt   = 0x50

cpuid        = 7
curthread    = 0xca523e10: pid 74341 "perl5.8.8"
curpcb       = 0xeee48d90
fpcurthread  = none
idlethread   = 0xc7cf9640: pid 10 "idle: cpu7"
APIC ID      = 7
currentldt   = 0x50

db> bt 74606
Tracing pid 74606 tid 100065 td 0xc8138190
sched_switch(c8138190,0,2) at sched_switch+0x143
mi_switch(2,0) at mi_switch+0x1ba
critical_exit(c0a06c60,ee77e9d4,c0899240,0,28060008,...) at critical_exit+0x9d
lapic_handle_timer(0) at lapic_handle_timer+0xc9
Xtimerint(c0a06c60,c8138190,0,0,0) at Xtimerint+0x30
namei(ee77ebcc) at namei+0x1ec
vn_open_cred(ee77ebcc,ee77eccc,1a4,d06a0b00,5,...) at vn_open_cred+0x2ad
vn_open(ee77ebcc,ee77eccc,1a4,5) at vn_open+0x1e
kern_open(c8138190,2806ac48,0,1,1b6,...) at kern_open+0xb6
open(c8138190,ee77ed04) at open+0x1a
syscall(3b,3b,3b,4,28070c80,...) at syscall+0x2bf
Xint0x80_syscall() at Xint0x80_syscall+0x1f
--- syscall (5, FreeBSD ELF32, open), eip = 0x280555af, esp =
0xbfbfebfc, ebp = 0xbfbfec28 ---
db> bt 74341
Tracing pid 74341 tid 101681 td 0xca523e10
sched_switch(7,7,1,eee48a0c,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> bt 67921
Tracing pid 67921 tid 102585 td 0xd1e98640
sched_switch(d1e98640,c7cfa960,6) at sched_switch+0x143
mi_switch(6,c7cfa960,c7cfaab8,c0a0b460,f154592c,...) at mi_switch+0x1ba
maybe_preempt(c7cfa960) at maybe_preempt+0xc4
sched_add(c7cfa960,4,d1e98640,c7cfa960,f1545950,...) at sched_add+0x258
setrunqueue(f1545968,c066d0fc,0,c7cef69c,0,...) at setrunqueue+0x63
_end() at 0xd1e98640
db> show lockchain 67921
thread 102585 (pid 67921, httpd) running on CPU 6
db> show lock Giant
 class: sleep mutex
 name: Giant
 flags: {DEF, RECURSE}
 state: {OWNED, CONTESTED}
 owner: 0xd0f68320 (tid 102209, pid 74018, "httpd")
db> bt 74018
Tracing pid 74018 tid 102209 td 0xd0f68320
sched_switch(c0a0a0b0,cd635c48,c9018f68,c0a0baf0,1,...) at sched_switch+0x143
MAXCPU(2004,cd635af0,0,16,1,...) at 0x7
_end() at 0xcb62b608
db> bt 71399
Tracing pid 71399 tid 101992 td 0xcfd37320
sched_switch(5,5,1,f0786a0c,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> bt 35417
Tracing pid 35417 tid 102675 td 0xc8413e10
sched_switch(7,7,1,f0e55ad8,c08a55d9,...) at sched_switch+0x143
MAXCPU(e5895590,62e85356,e8000110,ffffa3d5,ffb988e8,...) at 0xf7
*** error reading from address f8658d94 ***
db> show proc 74018
Process 74018 (httpd) at 0xd0f73218:
 state: NORMAL
 uid: 7336  gids: 999, 999, 7336
 parent: pid 8016 at 0xcaf69648
 ABI: FreeBSD ELF32
 arguments: /home/xxx/etc/apache/bin/httpd
 threads: 1
102209                   Run     CPU 4               httpd
db> show lockchain 74018
thread 102209 (pid 74018, httpd) running on CPU 4
db> show turnstile
db> show lockedbufs
buf at 0xdbda0400
b_flags = 0x20000000<vmio>
b_error = 0, b_bufsize = 16384, b_bcount = 16384, b_resid = 0
b_bufobj = (0xc8160d80), b_data = 0xdcdb1000, b_blkno = 920752960
b_npages = 4, pages(OBJ, IDX, PA): (0xc8163084, 0x6dc3268,
0x60fd4000),(0xc8163084, 0x6dc3269, 0x58495000),(0xc8163084,
0x6dc326a, 0x1b8b6000),(0xc8163084, 0x6dc326b, 0x74f97000)
db> show lockedvnods
Locked vnodes

db> show sleepq
db> show lockmgr
db> show sleepchain
thread 101368 (pid 72732, httpd) running on CPU 1
db> bt 72732
Tracing pid 72732 tid 101368 td 0xcc20c000
kdb_enter(c094016e) at kdb_enter+0x2b
siointr1(c7ee6c00) at siointr1+0xce
siointr(c7ee6c00) at siointr+0x5e
intr_execute_handlers(c7cef4c8,efcaab98,4,efcaabe8,c0899013,...) at
intr_execute_handlers+0xe1
lapic_handle_intr(37) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc067a3be, esp = 0xefcaabdc, ebp = 0xefcaabe8 ---
_mtx_lock_sleep(c0a06c60,cc20c000,0,0,0) at _mtx_lock_sleep+0xb6
vm_fault(ca2a1818,28747000,1,0) at vm_fault+0x1a5
trap_pfault(efcaad38,1,28747584,28747584,0,...) at trap_pfault+0x123
trap(280b003b,bfbf003b,bfbf003b,280b79c8,bfbfed84,...) at trap+0x1eb
calltrap() at calltrap+0x5
--- trap 0xc, eip = 0x28747584, esp = 0xbfbfeaec, ebp = 0xbfbfeb08 ---
db> bt
Tracing pid 72732 tid 101368 td 0xcc20c000
kdb_enter(c094016e) at kdb_enter+0x2b
siointr1(c7ee6c00) at siointr1+0xce
siointr(c7ee6c00) at siointr+0x5e
intr_execute_handlers(c7cef4c8,efcaab98,4,efcaabe8,c0899013,...) at
intr_execute_handlers+0xe1
lapic_handle_intr(37) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc067a3be, esp = 0xefcaabdc, ebp = 0xefcaabe8 ---
_mtx_lock_sleep(c0a06c60,cc20c000,0,0,0) at _mtx_lock_sleep+0xb6
vm_fault(ca2a1818,28747000,1,0) at vm_fault+0x1a5
trap_pfault(efcaad38,1,28747584,28747584,0,...) at trap_pfault+0x123
trap(280b003b,bfbf003b,bfbf003b,280b79c8,bfbfed84,...) at trap+0x1eb
calltrap() at calltrap+0x5
--- trap 0xc, eip = 0x28747584, esp = 0xbfbfeaec, ebp = 0xbfbfeb08 ---


-- 
wbr,
pluknet


More information about the freebsd-stable mailing list