lock up in 6.2 (procs massively stuck in Giant)

pluknet pluknet at gmail.com
Tue May 12 20:59:22 UTC 2009


2009/5/12 John Baldwin <jhb at freebsd.org>:
> On Tuesday 12 May 2009 2:12:27 am pluknet wrote:
>> 2009/5/11 John Baldwin <jhb at freebsd.org>:
>> > On Monday 04 May 2009 11:41:35 pm pluknet wrote:
>> >> 2009/5/1 John Baldwin <jhb at freebsd.org>:
>> >> > On Thursday 30 April 2009 2:36:34 am pluknet wrote:
>> >> >> Hi folks.
>> >> >>
>> >> >> Today I got a new locking issue.
>> >> >> This is the first time I got it, and it's rarely reproduced.
>> >> >>
>> >> >> The box has lost both remote connection and local access.
>> >> >> No SIGINFO output on the local console even.
>> >> >> Jumping in ddb> shows the next:
>> >> >>
>> >> >> 1) first, this is a 8-way web server. No processes on runqueue except
> one
>> >> > httpd
>> >> >> (i.e. ps shows R in its state):
>> >> >
>> >> > You need to find who owns Giant and what that thread is doing.  You can
>> > try
>> >> > using 'show lock Giant' as well as 'show lockchain 11568'.
>> >> >
>> >>
>> >> Hi, John!
>> >>
>> >> Just reproduced now on another box.
>> >> Hmm.. stack of the process owning Giant looks garbled.
>> >>
>> >> db> show lock Giant
>> >>  class: sleep mutex
>> >>  name: Giant
>> >>  flags: {DEF, RECURSE}
>> >>  state: {OWNED, CONTESTED}
>> >>  owner: 0xd0d79320 (tid 102754, pid 34594, "httpd")
>> >>
>> >> db> show lockchain 34594
>> >> thread 102754 (pid 34594, httpd) running on CPU 7
>> >> db> show lockchain 102754
>> >> thread 102754 (pid 34594, httpd) running on CPU 7
>> >
>> > The thread is running, so we don't know what its top of stack is and you
>> > can't get a good stack trace in that case.
>> >
>> > None of your CPUs are idle, so I don't think you have any sort of
> deadlock.
>> > You might have a livelock.
>> >
>> > --
>> > John Baldwin
>> >
>>
>> I'm curious if it could be caused by heavy load.
>> I don't know what it might be definitely,
>> as it's non-trivial for me to determine the reason
>> of a livelock, and to debug it.
>>
>> So I think it may make sense to try 7.x, as much
>> locking work has been done there.
>
> It may be worth trying 7.  Also, what is the state of the 'swi7: clock'
> process?
>
> --
> John Baldwin
>

Hi.

From just another box (not from the first two mentioned earlier)
with a similar locking issue. In case it is useful, since the
conditions here are possibly a bit different.
clock proc here is on swi4, I hope it's a non-important difference.

   18     0     0     0  LL     *Giant    0xd0a6b140 [swi4: clock sio]
db> bt 18
Tracing pid 18 tid 100015 td 0xc7cfec80
sched_switch(c7cfec80,0,1) at sched_switch+0x143
mi_switch(1,0) at mi_switch+0x1ba
turnstile_wait(c0a06c60,cb77ee10) at turnstile_wait+0x2f7
_mtx_lock_sleep(c0a06c60,c7cfec80,0,0,0) at _mtx_lock_sleep+0xfc
softclock(0) at softclock+0x231
ithread_execute_handlers(c7d07218,c7d4a100) at ithread_execute_handlers+0x125
ithread_loop(c7cb69f0,e6892d38) at ithread_loop+0x55
fork_exit(c066d3e4,c7cb69f0,e6892d38) at fork_exit+0x71
fork_trampoline() at fork_trampoline+0x8
--- trap 0x1, eip = 0, esp = 0xe6892d6c, ebp = 0 ---

db> show lock Giant
 class: sleep mutex
 name: Giant
 flags: {DEF, RECURSE}
 state: {OWNED, CONTESTED}
 owner: 0xcb77ee10 (tid 101174, pid 8611, "httpd")
db> show lockchain 101174
thread 101174 (pid 8611, httpd) running on CPU 4
db> bt 101174
Tracing pid 8611 tid 101174 td 0xcb77ee10
sched_switch(cb77ee10,c7f3de10,6) at sched_switch+0x143
mi_switch(ca6d82e8,6,c0a0baf0,ca6d82e8,c0a0a0b0,...) at mi_switch
kseq_move(c0a0baf0,6) at kseq_move+0xc1
sched_balance_pair(ef879bb0,ef879bb0,c08a2adf,cb77ef68,cb77b360,.
         lance_pair+0x91
sched_lock(0,cbd1f658,0,cb77b36c,0,...) at sched_lock
_end(cb77b360,cb77b364,cb77ee10,cb77ee18,0,...) at 0xcb77b360
_end(d0a49a80,d0a49a84,c84cf7d0,c84cf7d8,0,...) at 0xc7f97648
_end(ca6dbcc0,ca6dbcc4,ca6d54b0,ca6d54b8,0,...) at 0xcbd1f648
_end(cbcad780,cbcad784,cc8a2190,cc8a2198,0,...) at 0xc8514430
_end(cab883c0,cab883c4,ca9417d0,ca9417d8,0,...) at 0xca6dc000
_end(cc67c4e0,cc67c4e4,cd6fd000,cd6fd008,0,...) at 0xcc8abc90
_end(cd3a9120,cd3a9124,cd3b1320,cd3b1328,0,...) at 0xcad68218
_end(cd130c60,cd130c64,d00ca320,d00ca328,0,...) at 0xca71e860
_end(cbcac240,cbcac244,cbf6e4b0,cbf6e4b8,0,...) at 0xcd472a78
_end(cb73c960,cb73c964,cb4f44b0,cb4f44b8,0,...) at 0xd00cfa78
_end(ca348b40,ca348b44,ca420af0,ca420af8,0,...) at 0xcc0e9c90
_end(d0310ea0,d0310ea4,cd3ad4b0,cd3ad4b8,0,...) at 0xcc7ec218
_end(ca5ddd20,ca5ddd24,ca6d8c80,ca6d8c88,0,...) at 0xca426c90
_end(c998aa20,c998aa24,ca2bb320,ca2bb328,0,...) at 0xd030fc90
[...] oh, i saw that earlier somewhere.. don't remember where.

db> c
and waiting some moments shows a little different picture:
db> bt 101174
Tracing pid 8611 tid 101174 td 0xcb77ee10
sched_switch(cb77ee10,c7f3de10,6) at sched_switch+0x143
mi_switch(cf177608,7,c0a0b460,cf177608,c0a0a0b0,...) at mi_switch+0x1ba
kseq_move(c0a0b460,7) at kseq_move+0xc1
sched_balance_pair(cb77ef68,ef879bb8,c0694edf,cb77ef68,cb77b360,...)
at sched_balance_pair+0x91
_end(cbd1f650,cb77ee10,cb77ee20,0,cb77b374,...) at 0xcb77b360
MAXCPU(cb77b360,cb77b364,cb77ee10,cb77ee18,0,...) at 0
_end(d0a49a80,d0a49a84,c84cf7d0,c84cf7d8,0,...) at 0xc7f97648
_end(ca6dbcc0,ca6dbcc4,ca6d54b0,ca6d54b8,0,...) at 0xcbd1f648
_end(cbcad780,cbcad784,cc8a2190,cc8a2198,0,...) at 0xc8514430
_end(cab883c0,cab883c4,ca9417d0,ca9417d8,0,...) at 0xca6dc000
_end(cc67c4e0,cc67c4e4,cd6fd000,cd6fd008,0,...) at 0xcc8abc90
_end(cd3a9120,cd3a9124,cd3b1320,cd3b1328,0,...) at 0xcad68218
_end(cd130c60,cd130c64,d00ca320,d00ca328,0,...) at 0xca71e860
_end(cbcac240,cbcac244,cbf6e4b0,cbf6e4b8,0,...) at 0xcd472a78
[...]

-- 
wbr,
pluknet


More information about the freebsd-stable mailing list