[Bug 204081] Uninterruptible sleep under 10.2-RELEASE
bugzilla-noreply at freebsd.org
bugzilla-noreply at freebsd.org
Wed Oct 28 04:52:49 UTC 2015
https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081
Bug ID: 204081
Summary: Uninterruptible sleep under 10.2-RELEASE
Product: Base System
Version: 10.2-RELEASE
Hardware: amd64
OS: Any
Status: New
Severity: Affects Some People
Priority: ---
Component: kern
Assignee: freebsd-bugs at FreeBSD.org
Reporter: nick.frampton at akips.com
We have a process which randomly gets stuck in uninterruptible sleep.
The process is a large scale ping and SNMP poller, using libpcap, raw ICMP
socket, and UDP socket.
If sendto(2) fails, we call usleep(50000) and retry. We suspect (but can't
confirm) that the process is stuck in this sleep.
We have not been able to reliably reproduce this problem. It has happened on a
customer's server and in our test lab. The process is still currently stuck in
this state and we can provide additional debug info if requested.
# freebsd-version -ku; uname -apKU
10.2-RELEASE-p3
10.2-RELEASE-p3
FreeBSD shed163.akips.com 10.2-RELEASE-p3 FreeBSD 10.2-RELEASE-p3 #0 r287883:
Thu Sep 17 05:59:12 UTC 2015
root at shed31.akips.com:/usr/obj/usr/src/sys/GENERIC amd64 amd64 1002000 1002000
# top -b
last pid: 27815; load averages: 1.79, 1.29, 1.23 up 12+18:46:53
14:50:31
107 processes: 4 running, 98 sleeping, 5 zombie
Mem: 723M Active, 183M Inact, 2937M Wired, 2400K Cache, 435M Buf, 106M Free
ARC: 2496M Total, 134M MFU, 2342M MRU, 100K Anon, 11M Header, 10M Other
Swap: 4096M Total, 524M Used, 3572M Free, 12% Inuse
PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAND
20499 akips 1 31 0 117M 71800K RUN 1 5:21 6.05% perl
19844 akips 1 23 0 70948K 21456K RUN 0 11:41 4.05% perl
19845 akips 1 22 0 750M 45900K select 0 9:53 3.08% nm-db
20500 akips 1 21 0 802M 104M select 1 1:06 1.46% nm-db
24516 akips 1 20 0 74020K 5352K select 0 70:06 1.07% nm-httpd
24464 akips 1 20 0 73948K 3308K RUN 0 41:37 0.59% nm-timed
24467 root 1 20 0 421M 63696K select 1 52:24 0.29% nm-joatd
24525 akips 1 20 0 104M 21600K select 0 85:04 0.20%
nm-flow-collector
24528 akips 1 4 0 782M 77196K 0 222:50 0.00% nm-poller
24511 akips 1 20 0 123M 3512K select 0 18:14 0.00% nm-msgd
24538 akips 1 20 0 98M 28432K select 1 18:05 0.00%
nm-flow-meter
24510 akips 1 20 0 750M 269M nanslp 0 17:34 0.00%
nm-poller-helper
24536 akips 1 20 0 98M 27008K select 0 15:44 0.00%
nm-flow-meter
24512 root 1 20 0 21948K 2652K nanslp 1 13:45 0.00% top
24537 akips 1 20 0 92408K 18668K select 0 6:37 0.00%
nm-flow-meter
24527 akips 1 20 0 750M 43756K select 0 4:58 0.00%
nm-snmp-trapd
24544 akips 1 20 0 88312K 13080K select 1 3:59 0.00%
nm-flow-meter
24493 akips 1 20 0 78056K 3652K select 0 3:56 0.00%
nm-fifo-tee
# procstat -kk 24528
PID TID COMM TDNAME KSTACK
24528 100360 nm-poller - mi_switch+0xe1
sleepq_timedwait_sig+0x8b _cv_timedwait_sig_sbt+0x18b seltdwait+0xa4
kern_select+0x8fa sys_select+0x54 amd64_syscall+0x357 Xfast_syscall+0xfb
# ps -o user,pid,ppid,pending,caught,ignored,blocked,stat,wchan 24528
USER PID PPID PENDING CAUGHT IGNORED BLOCKED STAT WCHAN
akips 24528 1 0 f008568b 8408000 60085003 Ds -
# ps -o pid,paddr 24528
PID PADDR
24528 fffff8009d563000
(kgdb) p *(struct proc *)0xfffff8009d563000
$1 = {p_list = {le_next = 0xfffff800194339d0, le_prev = 0xfffff800133da000},
p_threads = {
tqh_first = 0xfffff8009d566000, tqh_last = 0xfffff8009d566010}, p_slock =
{lock_object = {
lo_name = 0xffffffff80fc8265 "process slock", lo_flags = 720896, lo_data
= 0,
lo_witness = 0x0}, mtx_lock = 4}, p_ucred = 0xfffff80013daf300, p_fd =
0xfffff800c19f8000,
p_fdtol = 0x0, p_stats = 0xfffff80013c1b200, p_limit = 0xfffff800ac7d2400,
p_limco = {c_links = {
le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe =
{tqe_next = 0x0,
tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0,
c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0},
p_sigacts = 0xfffff800320aa000, p_flag = 268435712, p_flag2 = 0, p_state =
PRS_NORMAL,
p_pid = 24528, p_hash = {le_next = 0x0, le_prev = 0xfffffe00006f1e80},
p_pglist = {le_next = 0x0,
le_prev = 0xfffff8007bd35090}, p_pptr = 0xfffff800029724e8, p_sibling = {
le_next = 0xfffff800194339d0, le_prev = 0xfffff800029725e0}, p_children =
{lh_first = 0x0},
p_mtx = {lock_object = {lo_name = 0xffffffff80fc8258 "process lock", lo_flags
= 21168128,
lo_data = 0, lo_witness = 0x0}, mtx_lock = 4}, p_ksi =
0xfffff80013301070, p_sigqueue = {
sq_signals = {__bits = {0, 0, 0, 0}}, sq_kill = {__bits = {0, 0, 0, 0}},
sq_list = {
tqh_first = 0x0, tqh_last = 0xfffff8009d563148}, sq_proc =
0xfffff8009d563000, sq_flags = 1},
p_oppid = 0, p_vmspace = 0xfffff801062e7380, p_swtick = 686191978,
p_realtimer = {it_interval = {
tv_sec = 0, tv_usec = 0}, it_value = {tv_sec = 0, tv_usec = 0}}, p_ru =
{ru_utime = {
tv_sec = 0, tv_usec = 0}, ru_stime = {tv_sec = 0, tv_usec = 0}, ru_maxrss
= 0, ru_ixrss = 0,
ru_idrss = 0, ru_isrss = 0, ru_minflt = 0, ru_majflt = 0, ru_nswap = 0,
ru_inblock = 0,
ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 0, ru_nvcsw =
0, ru_nivcsw = 0},
p_rux = {rux_runtime = 22692949475677, rux_uticks = 290352, rux_sticks =
1363204, rux_iticks = 0,
rux_uu = 2347747468, rux_su = 11022676790, rux_tu = 13370422433}, p_crux =
{rux_runtime = 0,
rux_uticks = 0, rux_sticks = 0, rux_iticks = 0, rux_uu = 0, rux_su = 0,
rux_tu = 0},
p_profthreads = 0, p_exitthreads = 0, p_traceflag = 0, p_tracevp = 0x0,
p_tracecred = 0x0,
p_textvp = 0xfffff800cfa9d588, p_lock = 0, p_sigiolst = {slh_first = 0x0},
p_sigparent = 20,
p_sig = 0, p_code = 0, p_stops = 0, p_stype = 0, p_step = 0 '\0', p_pfsflags
= 0 '\0',
p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, p_suspcount = 0,
p_xthread = 0x0,
p_boundary_count = 0, p_pendingcnt = 0, p_itimers = 0x0, p_procdesc = 0x0,
p_magic = 3203398350,
p_osrel = 1002000, p_comm =
"nm-poller\000\000\000\000\000\000\000\000\000\000",
p_pgrp = 0xfffff8007bd35080, p_sysent = 0xffffffff814d4388, p_args =
0xfffff80002b73360,
p_cpulimit = 9223372036854775807, p_nice = 0 '\0', p_fibnum = 0, p_xstat = 0,
p_klist = {
kl_list = {slh_first = 0x0}, kl_lock = 0xffffffff80907d90
<knlist_mtx_lock>,
kl_unlock = 0xffffffff80907dd0 <knlist_mtx_unlock>,
kl_assert_locked = 0xffffffff80907e10 <knlist_mtx_assert_locked>,
kl_assert_unlocked = 0xffffffff80907e20 <knlist_mtx_assert_unlocked>,
kl_lockarg = 0xfffff8009d563100}, p_numthreads = 1, p_md = {md_ldt = 0x0,
md_ldt_sd = {
sd_lolimit = 0, sd_lobase = 0, sd_type = 0, sd_dpl = 0, sd_p = 0,
sd_hilimit = 0, sd_xx0 = 0,
sd_gran = 0, sd_hibase = 0, sd_xx1 = 0, sd_mbz = 0, sd_xx2 = 0}},
p_itcallout = {c_links = {
le = {le_next = 0x0, le_prev = 0x0}, sle = {sle_next = 0x0}, tqe =
{tqe_next = 0x0,
tqe_prev = 0x0}}, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0,
c_lock = 0xfffff8009d563100, c_flags = 0, c_iflags = 0, c_cpu = 0},
p_acflag = 1, p_peers = 0x0,
p_leader = 0xfffff8009d563000, p_emuldata = 0x0, p_label = 0x0, p_sched =
0xfffff8009d5634e8,
p_ktr = {stqh_first = 0x0, stqh_last = 0xfffff8009d563448}, p_mqnotifier =
{lh_first = 0x0},
p_dtrace = 0xfffff8008855f100, p_pwait = {cv_description = 0xffffffff80fc8c59
"ppwait",
cv_waiters = 0}, p_dbgwait = {cv_description = 0xffffffff80fc8c60
"dbgwait", cv_waiters = 0},
p_prev_runtime = 0, p_racct = 0x0, p_throttled = 0 '\0', p_orphan = {le_next
= 0x0,
le_prev = 0x0}, p_orphans = {lh_first = 0x0}, p_treeflag = 0, p_reaper =
0xfffff800029724e8,
p_reaplist = {lh_first = 0x0}, p_reapsibling = {le_next = 0xfffff800194339d0,
le_prev = 0xfffff800133da4d0}, p_reapsubtree = 16}
(kgdb) p/x *(((struct proc *)0xfffff8009d563000)->p_threads.tqh_first)
$2 = {td_lock = 0xffffffff815cb380, td_proc = 0xfffff8009d563000, td_plist =
{tqe_next = 0x0,
tqe_prev = 0xfffff8009d563010}, td_runq = {tqe_next = 0x0, tqe_prev =
0xffffffff815cb5c8},
td_slpq = {tqe_next = 0x0, tqe_prev = 0xfffff800027cdc80}, td_lockq =
{tqe_next = 0x0,
tqe_prev = 0xfffffe0122b4e638}, td_hash = {le_next = 0x0, le_prev =
0xfffffe0000866040},
td_cpuset = 0xfffff8000296c3a8, td_sel = 0xfffff80106647300, td_sleepqueue =
0xfffff800027cdc80,
td_turnstile = 0xfffff8009d5b5a80, td_rlqe = 0xfffff80013434d70, td_umtxq =
0xfffff80013c2cb80,
td_tid = 0x18808, td_sigqueue = {sq_signals = {__bits = {0x0, 0x0, 0x0,
0x0}}, sq_kill = {
__bits = {0x0, 0x0, 0x0, 0x0}}, sq_list = {tqh_first = 0x0, tqh_last =
0xfffff8009d5660b8},
sq_proc = 0xfffff8009d563000, sq_flags = 0x1}, td_lend_user_pri = 0xff,
td_flags = 0x14,
td_inhibitors = 0x2, td_pflags = 0x0, td_dupfd = 0x0, td_sqqueue = 0x0,
td_wchan = 0x0,
td_wmesg = 0x0, td_lastcpu = 0x0, td_oncpu = 0xff, td_owepreempt = 0x0,
td_tsqueue = 0x0,
td_locks = 0xca3f, td_rw_rlocks = 0x0, td_lk_slocks = 0x0, td_stopsched =
0x0, td_blocked = 0x0,
td_lockname = 0x0, td_contested = {lh_first = 0x0}, td_sleeplocks = 0x0,
td_intr_nesting_level = 0x0, td_pinned = 0x0, td_ucred = 0xfffff80013daf300,
td_estcpu = 0x0,
td_slptick = 0x0, td_blktick = 0x0, td_swvoltick = 0x36cff98d, td_cow = 0x22,
td_ru = {ru_utime = {
tv_sec = 0x0, tv_usec = 0x0}, ru_stime = {tv_sec = 0x0, tv_usec = 0x0},
ru_maxrss = 0x37b40,
ru_ixrss = 0x2f4f0180, ru_idrss = 0xc3b0abce0, ru_isrss = 0xc9d9a00,
ru_minflt = 0x20b07,
ru_majflt = 0x3df, ru_nswap = 0x0, ru_inblock = 0x0, ru_oublock = 0x1,
ru_msgsnd = 0x4e8632d,
ru_msgrcv = 0x2385fa8, ru_nsignals = 0x0, ru_nvcsw = 0x2d34637, ru_nivcsw =
0xa450a3}, td_rux = {
rux_runtime = 0x14a39d31415d, rux_uticks = 0x46e30, rux_sticks = 0x14cd04,
rux_iticks = 0x0,
rux_uu = 0x8befc16a, rux_su = 0x29100b336, rux_tu = 0x31cf074a1},
td_incruntime = 0x0,
td_runtime = 0x14a39d31415d, td_pticks = 0x0, td_sticks = 0x0, td_iticks =
0x0, td_uticks = 0x0,
td_intrval = 0x0, td_oldsigmask = {__bits = {0x0, 0x0, 0x0, 0x0}},
td_generation = 0x37796da,
td_sigstk = {ss_sp = 0x0, ss_size = 0x0, ss_flags = 0x4}, td_xsig = 0x0,
td_profil_addr = 0x0,
td_profil_ticks = 0x0, td_name = {0x6e, 0x6d, 0x2d, 0x70, 0x6f, 0x6c, 0x6c,
0x65, 0x72,
0x0 <repeats 11 times>}, td_fpop = 0x0, td_dbgflags = 0x0, td_dbgksi =
{ksi_link = {
tqe_next = 0x0, tqe_prev = 0x0}, ksi_info = {si_signo = 0x0, si_errno =
0x0, si_code = 0x0,
si_pid = 0x0, si_uid = 0x0, si_status = 0x0, si_addr = 0x0, si_value =
{sival_int = 0x0,
sival_ptr = 0x0, sigval_int = 0x0, sigval_ptr = 0x0}, _reason = {_fault
= {_trapno = 0x0},
_timer = {_timerid = 0x0, _overrun = 0x0}, _mesgq = {_mqd = 0x0}, _poll
= {_band = 0x0},
__spare__ = {__spare1__ = 0x0, __spare2__ = {0x0, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0}}}},
ksi_flags = 0x0, ksi_sigq = 0x0}, td_ng_outbound = 0x0, td_osd =
{osd_nslots = 0x0,
osd_slots = 0x0, osd_next = {le_next = 0x0, le_prev = 0x0}},
td_map_def_user = 0x0,
td_dbg_forked = 0x0, td_vp_reserv = 0x0, td_no_sleeping = 0x0, td_dom_rr_idx
= 0x0, td_sigmask = {
__bits = {0x60085003, 0x0, 0x0, 0x0}}, td_rqindex = 0x1a, td_base_pri =
0x68,
td_priority = 0x68, td_pri_class = 0x3, td_user_pri = 0x7f, td_base_user_pri
= 0x7f,
td_pcb = 0xfffffe0122aa4cc0, td_state = 0x1, td_retval = {0x0, 0x0},
td_slpcallout = {c_links = {
le = {le_next = 0x0, le_prev = 0xfffffe0000880840}, sle = {sle_next =
0x0}, tqe = {
tqe_next = 0x0, tqe_prev = 0xfffffe0000880840}}, c_time =
0xe082d08b24e81,
c_precision = 0x12381, c_arg = 0xfffff8009d566000, c_func =
0xffffffff809901f0, c_lock = 0x0,
c_flags = 0x0, c_iflags = 0x110, c_cpu = 0x1}, td_frame =
0xfffffe0122aa4c00,
td_kstack_obj = 0xfffff8009d709700, td_kstack = 0xfffffe0122aa1000,
td_kstack_pages = 0x4,
td_critnest = 0x1, td_md = {md_spinlock_count = 0x1, md_saved_flags = 0x246,
md_spurflt_addr = 0x801018000}, td_sched = 0xfffff8009d566470, td_ar = 0x0,
td_lprof = {{
lh_first = 0x0}, {lh_first = 0x0}}, td_dtrace = 0xfffff8001394db00,
td_errno = 0x0,
td_vnet = 0x0, td_vnet_lpush = 0x0, td_intr_frame = 0x0, td_rfppwait_p =
0xfffff8001334c4e8,
td_ma = 0x0, td_ma_cnt = 0x0, td_su = 0x0}
--
You are receiving this mail because:
You are the assignee for the bug.
More information about the freebsd-bugs
mailing list