5.4 hangs on disk IO

Douglas K. Rand rand at meridian-enviro.com
Fri Oct 7 12:39:41 PDT 2005


I've got 2 FreeBSD 5.4 systems that seem to get stuck doing disk
IO. When the system gets hung, it seems to refuse to do any disk
io. It will continue to respond to pings, and the tty driver on the
serial console continues to work and echo characters. But all the
processes seem to get stuck in the state "ufs". On the serial console
I tried sync, which then hangs, and ^T produces:

athearn# sync
load: 0.66  cmd: csh 4138 [ufs] 0.00u 0.00s 0% 1748k

One of the systems emits these error messages on the console:

swap_pager: indefinite wait buffer: device: da0s1a, blkno: 2, size: 4096
swap_pager: indefinite wait buffer: device: da0s1h, blkno: 6, size: 4096
swap_pager: indefinite wait buffer: device: da0s1h, blkno: 7, size: 4096
swap_pager: indefinite wait buffer: device: da0s1h, blkno: 15, size: 4096
swap_pager: indefinite wait buffer: device: da0s1a, blkno: 26, size: 4096

Other than that difference, both systems hang the same way.

The interesting part is that it only seems to happen when I run
amd. With amd running one of our users can hang the system in about
5-10 minutes of heavy disk traffic to a LOCAL disk. The local disk
with the heaviest traffic is "behind" a amd managed symlink. If I
don't run amd and do all the NFS mounts by hand and build the symlink
by hand, the system runs fine. I've tried both the stock amd that
comes with 5.4, the fairly outdated 6.0.10-20040513, and the brand new
6.1.2.1 from ports. 

From the kernel debugger I can't even panic or call boot(0) with out
errors. 

I can easily reproduce the hang, so if there is any suggestions for
things to poke at with the kernel debugger on the serial console, let
me know.

Here's the result of ps from a hung system, along with the errors from
panic and boot(0):

db> ps
  pid   proc     uid  ppid  pgrp  flag   stat  wmesg    wchan  cmd
 4138 c34051c4    0   530  4138 4000012 [SLPQ ufs 0xc346b2bc][SLP] csh
 4137 c3e8be20 1002   740   735 4000000 [SLPQ sbwait 0xc366fe84][SLP] perl5.8.7
 4136 c3e8bc5c 1002   741   735 4000000 [SLPQ sbwait 0xc3a95d40][SLP] perl5.8.7
 4135 c3e071c4 1002   743   735 0004000 [SLPQ nfs 0xc35a7e14][SLP] mkdir
 4134 c3d8e1c4 1002   749   735 0004000 [SLPQ ufs 0xc346b2bc][SLP] perl5.8.7
 4132 c3e6154c 1002   739   735 0004000 [SLPQ nfs 0xc35a7e14][SLP] remap
 2355 c3714a98 1000   682  2355 0004002 [SLPQ select 0xc06787c4][SLP] top
  749 c3ba3c5c 1002     1   735 0004002 [SLPQ wait 0xc3ba3c5c][SLP] tclsh8.4
  748 c348f54c 1002     1   735 0004002 [SLPQ accept 0xc36a6a5a][SLP] perl5.8.7
  747 c3710a98 1002     1   735 0004002 [SLPQ accept 0xc3a6154a][SLP] perl5.8.7
  746 c370fa98 1002     1   735 0004002 [SLPQ accept 0xc374e17e][SLP] perl5.8.7
  745 c3ba3e20 1002     1   735 0004002 [SLPQ accept 0xc374e68e][SLP] perl5.8.7
  744 c3ba6000 1002     1   735 0004002 [SLPQ accept 0xc35ac7d2][SLP] perl5.8.7
  743 c3ba61c4 1002     1   735 0004002 [SLPQ wait 0xc3ba61c4][SLP] perl5.8.7
  742 c3ba6388 1002     1   735 0004002 [SLPQ accept 0xc366f406][SLP] perl5.8.7
  741 c34b6e20 1002     1   735 0004002 [SLPQ wait 0xc34b6e20][SLP] perl5.8.7
  740 c34b6388 1002     1   735 0004002 [SLPQ wait 0xc34b6388][SLP] perl5.8.7
  739 c34b6c5c 1002     1   735 0004002 [SLPQ wait 0xc34b6c5c][SLP] perl5.8.7
  682 c34ef000 1000   681   682 0004002 [SLPQ pause 0xc34ef038][SLP] zsh
  681 c340954c 1000   678   678 0000100 [SLPQ select 0xc06787c4][SLP] sshd
  678 c348ca98    0   442   678 0000100 [SLPQ sbwait 0xc3a616ec][SLP] sshd
  530 c3710e20    0   529   530 0004002 [SLPQ ppwait 0xc3710e20][SLP] csh
  529 c370f388    0     1   529 0004102 [SLPQ wait 0xc370f388][SLP] login
  528 c3710388    0     1   528 0004002 [SLPQ ttyin 0xc31d0a10][SLP] getty
  527 c348c1c4    0     1   527 0004002 [SLPQ ttyin 0xc31b8410][SLP] getty
  526 c348f388    0     1   526 0004002 [SLPQ ttyin 0xc31b8010][SLP] getty
  525 c34ef8d4    0     1   525 0004002 [SLPQ ttyin 0xc3166c10][SLP] getty
  524 c34b6a98    0     1   524 0004002 [SLPQ ttyin 0xc3166210][SLP] getty
  523 c348f8d4    0     1   523 0004002 [SLPQ ttyin 0xc3166410][SLP] getty
  522 c348ce20    0     1   522 0004002 [SLPQ ttyin 0xc3166610][SLP] getty
  521 c34f3e20    0     1   521 0004002 [SLPQ ttyin 0xc3166810][SLP] getty
  491 c3405388    0     1   490 0000000 [SLPQ ufs 0xc346b2bc][SLP] snmpd
  464 c3710710    0     1   464 0000000 [SLPQ ufs 0xc346b2bc][SLP] cron
  452 c34b6710   25     1   452 0000100 [SLPQ pause 0xc34b6748][SLP] sendmail
  448 c3714000    0     1   448 0000100 [SLPQ pause 0xc3714038][SLP] sendmail
  442 c34f38d4    0     1   442 0000100 [SLPQ select 0xc06787c4][SLP] sshd
  427 c3409710    0     1   427 0000000 [SLPQ select 0xc06787c4][SLP] ntpd
  382 c3167c5c    0   378   378 0000000 [SLPQ - 0xc3383c00][SLP] nfsd
  381 c34ef388    0   378   378 0000000 [SLPQ - 0xc33d7200][SLP] nfsd
  380 c34f354c    0   378   378 0000000 [SLPQ - 0xc33f6400][SLP] nfsd
  379 c34b654c    0   378   378 0000000 [SLPQ - 0xc338f000][SLP] nfsd
  378 c3405710    0     1   378 0000000 [SLPQ accept 0xc365403a][SLP] nfsd
  376 c34f3388    0     1   376 0000000 [SLPQ select 0xc06787c4][SLP] mountd
  325 c348fe20    0     1   325 0100000 [SLPQ ufs 0xc346b2bc][SLP] amd
  302 c34ef710    0     1   302 0000000 [SLPQ ufs 0xc346b2bc][SLP] ypbind
  289 c34efe20    0     1   289 0000000 [SLPQ ufs 0xc346b2bc][SLP] rpcbind
  271 c3409e20    0     1   271 0000000 [SLPQ select 0xc06787c4][SLP] syslogd
  253 c348fa98    0     1   253 0000000 [SLPQ select 0xc06787c4][SLP] devd
   66 c3405a98    0     0     0 0000204 [SLPQ - 0xe6b9cd14][SLP] schedcpu
   65 c3405c5c    0     0     0 0000204 [SLPQ - 0xc067b1ec][SLP] nfsiod 3
   64 c3405e20    0     0     0 0000204 [SLPQ - 0xc067b1e8][SLP] nfsiod 2
   63 c3409000    0     0     0 0000204 [SLPQ - 0xc067b1e4][SLP] nfsiod 1
   62 c34091c4    0     0     0 0000204 [SLPQ - 0xc067b1e0][SLP] nfsiod 0
   61 c3409388    0     0     0 0000204 [SLPQ syncer 0xc064e30c][SLP] syncer
   60 c31131c4    0     0     0 0000204 [SLPQ vlruwt 0xc31131c4][SLP] vnlru
   59 c3113388    0     0     0 0000204 [SLPQ psleep 0xc0678cac][SLP] bufdaemon
   58 c311354c    0     0     0 000020c [SLPQ pgzero 0xc0681a54][SLP] pagezero
   57 c3113710    0     0     0 0000204 [SLPQ psleep 0xc0681aa8][SLP] vmdaemon
   56 c31138d4    0     0     0 0000204 [SLPQ psleep 0xc0681a64][SLP] pagedaemon
   55 c3113a98    0     0     0 0000204 [IWAIT] swi0: sio
    9 c3113c5c    0     0     0 0000204 [SLPQ actask 0xc0753bec][SLP] acpi_task2
    8 c3113e20    0     0     0 0000204 [SLPQ actask 0xc0753bec][SLP] acpi_task1
    7 c3167000    0     0     0 0000204 [SLPQ actask 0xc0753bec][SLP] acpi_task0
   54 c31671c4    0     0     0 0000204 [IWAIT] swi6:+
   53 c3167388    0     0     0 0000204 [IWAIT] swi6: task queue
   52 c316754c    0     0     0 0000204 [IWAIT] swi6: acpitaskq
   51 c3167710    0     0     0 0000204 [IWAIT] swi3: cambio
   50 c31678d4    0     0     0 0000204 [IWAIT] swi2: camnet
    6 c30fba98    0     0     0 0000204 [SLPQ - 0xc3158140][SLP] kqueue taskq
   49 c30fbc5c    0     0     0 0000204 [IWAIT] swi6:+
    5 c30fbe20    0     0     0 0000204 [SLPQ - 0xc3158200][SLP] thread taskq
   48 c310f000    0     0     0 0000204 [SLPQ - 0xc0646180][SLP] yarrow
    4 c310f1c4    0     0     0 0000204 [SLPQ - 0xc0648c08][SLP] g_down
    3 c310f388    0     0     0 0000204 [SLPQ - 0xc0648c04][SLP] g_up
    2 c310f54c    0     0     0 0000204 [SLPQ - 0xc0648bfc][SLP] g_event
   47 c310f710    0     0     0 0000204 [IWAIT] swi1: net
   46 c310f8d4    0     0     0 0000204 [IWAIT] swi4: vm
   45 c310fa98    0     0     0 000020c [IWAIT] swi5: clock sio
   44 c310fc5c    0     0     0 0000204 [IWAIT] irq0: clk
   43 c310fe20    0     0     0 0000204 [IWAIT] irq31:
   42 c3113000    0     0     0 0000204 [IWAIT] irq30:
   41 c30ef54c    0     0     0 0000204 [IWAIT] irq29:
   40 c30ef710    0     0     0 0000204 [IWAIT] irq28: iir0
   39 c30ef8d4    0     0     0 0000204 [IWAIT] irq27:
   38 c30efa98    0     0     0 0000204 [IWAIT] irq26:
   37 c30efc5c    0     0     0 0000204 [IWAIT] irq25: bge1
   36 c30efe20    0     0     0 0000204 [IWAIT] irq24: bge0
   35 c30fb000    0     0     0 0000204 [IWAIT] irq23:
   34 c30fb1c4    0     0     0 0000204 [IWAIT] irq22:
   33 c30fb388    0     0     0 0000204 [IWAIT] irq21:
   32 c30fb54c    0     0     0 0000204 [IWAIT] irq20:
   31 c30fb710    0     0     0 0000204 [IWAIT] irq19:
   30 c30fb8d4    0     0     0 0000204 [IWAIT] irq18:
   29 c309d1c4    0     0     0 0000204 [IWAIT] irq17:
   28 c309d388    0     0     0 0000204 [IWAIT] irq16:
   27 c309d54c    0     0     0 0000204 [IWAIT] irq15: ata1
   26 c309d710    0     0     0 0000204 [IWAIT] irq14: ata0
   25 c309d8d4    0     0     0 0000204 [IWAIT] irq13:
   24 c309da98    0     0     0 0000204 [IWAIT] irq12:
   23 c309dc5c    0     0     0 0000204 [IWAIT] irq11:
   22 c309de20    0     0     0 0000204 [IWAIT] irq10:
   21 c30ef000    0     0     0 0000204 [IWAIT] irq9: acpi0
   20 c30ef1c4    0     0     0 0000204 [IWAIT] irq8: rtc
   19 c30ef388    0     0     0 0000204 [IWAIT] irq7:
   18 c3094000    0     0     0 0000204 [IWAIT] irq6:
   17 c30941c4    0     0     0 0000204 [IWAIT] irq5:
   16 c3094388    0     0     0 0000204 [IWAIT] irq4: sio0
   15 c309454c    0     0     0 0000204 [IWAIT] irq3:
   14 c3094710    0     0     0 0000204 [IWAIT] irq1: atkbd0
   13 c30948d4    0     0     0 000020c [CPU 0] idle: cpu0
   12 c3094a98    0     0     0 000020c [CPU 1] idle: cpu1
   11 c3094c5c    0     0     0 000020c [CPU 2] idle: cpu2
   10 c3094e20    0     0     0 000020c [CPU 3] idle: cpu3
    1 c309d000    0     0     1 0004200 [SLPQ wait 0xc309d000][SLP] init
    0 c0648d00    0     0     0 0000200 [SLPQ sched 0xc0648d00][SLP] swapper

With INVARIANTS and WITNESS compiled in I get:

db>  panic
panic: from debugger
cpuid = 3
panic: mi_switch: switch in a critical section
cpuid = 3
panic: mi_switch: switch in a critical section

[These two messages continue until the system resets itself.]


But with out INVARIANTS and WITNESS I get:

db> panic
panic: from debugger
cpuid = 3
KDB: stack backtrace:
kdb_backtrace(ffe787e,722dddb9,ffc09014,c3095300,c31ce200) at kdb_backtrace+0x29
mi_switch(1,0,e493da60,c04b441c,c3095300) at mi_switch+0xf6
sched_bind(c3095300,0) at sched_bind+0x44
boot(104,104,c3095300,0,c06144d0) at boot+0x64
panic(c05f6e6b,e493db40,c04374d8,c04cc3cf,0) at panic+0x1b9
db_panic(c04cc3cf,0,ffffffff,e493dab4,0) at db_panic+0xd
db_command(c063eee4,c0619b20,c06144d0,c06144d4,c05f6e79) at db_command+0x264
db_command_loop(0,0,e493db6c,e493db58,e493dba0) at db_command_loop+0x5c
db_trap(3,0,3,c3095300,e493dbec) at db_trap+0xdd
kdb_trap(3,0,e493dbf4) at kdb_trap+0x8b
trap(18,10,10,61,c31a8000) at trap+0x464
calltrap() at calltrap+0x5
--- trap 0x3, eip = 0xc04cc3cf, esp = 0xe493dc34, ebp = 0xe493dc34 ---
kdb_enter(c060dc65) at kdb_enter+0x2b
siointr1(c31a8000) at siointr1+0xd5
siointr(c31a8000) at siointr+0x5e
intr_execute_handlers(c308f490,e493dca0,4,e493dce4,c05ccf13) at intr_execute_handlers+0x7d
lapic_handle_intr(34) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc072c5a9, esp = 0xe493dce4, ebp = 0xe493dce4 ---
acpi_cpu_c1(2aca5ff6,5ec6eb58,e493dd04,c3095300,4) at acpi_cpu_c1+0x5
acpi_cpu_idle(e493dd20,c04a1119,c3094e20,c04a1080,0) at acpi_cpu_idle+0xcf
cpu_idle(c3094e20,c04a1080,0,e493dd34,c04a0e19) at cpu_idle+0x28
idle_proc(0,e493dd48) at idle_proc+0x99
fork_exit(c04a1080,0,e493dd48) at fork_exit+0x75
fork_trampoline() at fork_trampoline+0x8
--- trap 0x1, eip = 0, esp = 0xe493dd7c, ebp = 0 ---

db> call boot(0)
KDB: stack backtrace:
kdb_backtrace(80ae517e,df7fe2e7,ffc09014,c3095300,c06144d0) at kdb_backtrace+0x29
mi_switch(1,0,e493da20,c04b441c,c3095300) at mi_switch+0xf6
sched_bind(c3095300,0) at sched_bind+0x44
boot(0,0,0,0,0,0,0,0,0,0) at boot+0x64
db_fncall(0,0,0,e493dab4,0) at db_fncall+0xf6
db_command(c063eee4,c0619b20,c06144d0,c06144d4,c05f6e79) at db_command+0x264
db_command_loop(0,0,e493db6c,e493db58,e493dba0) at db_command_loop+0x5c
db_trap(3,0,3,c3095300,e493dbec) at db_trap+0xdd
kdb_trap(3,0,e493dbf4) at kdb_trap+0x8b
trap(18,10,10,61,c31a8000) at trap+0x464
calltrap() at calltrap+0x5
--- trap 0x3, eip = 0xc04cc3cf, esp = 0xe493dc34, ebp = 0xe493dc34 ---
kdb_enter(c060dc65) at kdb_enter+0x2b
siointr1(c31a8000) at siointr1+0xd5
siointr(c31a8000) at siointr+0x5e
intr_execute_handlers(c308f490,e493dca0,4,e493dce4,c05ccf13) at intr_execute_handlers+0x7d
lapic_handle_intr(34) at lapic_handle_intr+0x2e
Xapic_isr1() at Xapic_isr1+0x33
--- interrupt, eip = 0xc072c5a9, esp = 0xe493dce4, ebp = 0xe493dce4 ---
acpi_cpu_c1(2aca5ff6,5ec6eb58,e493dd04,c3095300,4) at acpi_cpu_c1+0x5
Macpi_cpu_idle(e493dd20,c04a1119,c3094e20,c04a1080,0) at acpi_cpu_idle+0xcf
cpu_idle(c3094e20,c04a1080,0,e493dd34,c04a0e19) at cpu_idle+0x28
idle_proc(0,e493dd48) at idle_proc+0x99
fork_exit(c04a1080,0,e493dd48) at fork_exit+0x75
fork_trampoline() at fork_trampoline+0x8
--- trap 0x1, eip = 0, esp = 0xe493dd7c, ebp = 0 ---

db> call cpu_reset()
cpu_reset called on cpu#3
cpu_reset: Restarting BSP
cpu_reset_proxy: Stopped CPU 3


And my kernel config file:

machine         i386
cpu             I686_CPU
ident           KATO

makeoptions     MODULES_OVERRIDE="acpi"

options         SCHED_4BSD
options         INET
options         FFS
options         SOFTUPDATES
options         UFS_ACL
options         UFS_DIRHASH
options         NFSCLIENT
options         NFSSERVER
options         CD9660
options         PROCFS
options         PSEUDOFS
options         COMPAT_43
options         SCSI_DELAY=500
options         ADAPTIVE_GIANT
options         DDB
options         KDB
options         KDB_UNATTENDED
options         ALT_BREAK_TO_DEBUGGER
options         INVARIANTS
options         INVARIANT_SUPPORT
options         WITNESS
options         INCLUDE_CONFIG_FILE
options         MAXDSIZ=(1024*1024*1024)
options         DFLDSIZ=(256*1024*1024)
options         SMP

device          apic
device          isa
device          pci

device          ata
device          atapicd

device          iir
device          scbus
device          da
device          pass
device          ips

device          atkbdc
device          atkbd
device          vga
device          sc

device          npx

device          sio

device          miibus
device          bge

device          loop
device          mem
device          io
device          random
device          ether
device          pty
device          bpf


More information about the freebsd-stable mailing list