Occasionally we see a process get stuck in an unkillable state and the only solution is a hard reboot.
Occasionally == once every two weeks across 60+ servers, which are spread across the globe in customer sites. We have no remote access to these boxes. The process that most often that gets stuck, but not limited to, is a large scale Ping/SNMP poller. It is a fairly simplistic C program that just fires out lots of ping (raw ICMP socket) and SNMP (UDP socket) requests asynchronously. We've managed to trap the problem a few times on a test server running in VirtualBox, but it also occurs on customer sites who run VMware, Hyper-V, QEMU and on bare metal. We raise this PR https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081 but suspect it is a similar/same issue as https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200992 This is the info we've gathered from the most recent time it has occurred: # uname -a FreeBSD shed153.akips.com 10.2-RELEASE-p12 FreeBSD 10.2-RELEASE-p12 #0 r295070: Sat Jan 30 20:03:44 UTC 2016 r...@shed21.akips.com:/usr/obj/usr/src/sys/GENERIC amd64 The nm-poller has no state in top for some reason ?? last pid: 1847; load averages: 0.62, 1.20, 1.33 up 13+16:06:04 13:36:46 103 processes: 1 running, 102 sleeping CPU: 1.0% user, 0.0% nice, 4.3% system, 0.0% interrupt, 94.7% idle Mem: 650M Active, 541M Inact, 2527M Wired, 16M Cache, 417M Buf, 217M Free ARC: 2087M Total, 102M MFU, 1968M MRU, 18K Anon, 9409K Header, 9088K Other Swap: 4096M Total, 256M Used, 3840M Free, 6% Inuse PID USERNAME THR PRI NICE SIZE RES STATE C TIME WCPU COMMAND 1013 akips 1 20 0 74076K 5544K select 1 195:41 0.59% nm-httpd 1003 akips 1 20 0 164M 54328K select 0 236:18 0.49% nm-flow-collector 888 root 1 20 0 101M 14920K select 0 163:56 0.39% nm-joatd 885 akips 1 20 0 74004K 3092K nanslp 1 116:52 0.29% nm-timed 1014 akips 1 4 0 851M 104M 0 18.0H 0.00% nm-poller 1086 akips 1 20 0 21940K 2680K nanslp 0 66:25 0.00% top 1015 akips 1 20 0 819M 256M nanslp 1 56:45 0.00% nm-poller-db 1023 akips 1 20 0 114M 44760K select 0 55:00 0.00% nm-flow-meter 1005 akips 1 20 0 159M 4172K select 0 51:00 0.00% nm-msgd 1025 akips 1 20 0 114M 45644K select 0 44:22 0.00% nm-flow-meter 1012 akips 1 20 0 60360K 5132K piperd 1 20:08 0.00% perl 1027 akips 1 20 0 110M 34564K select 1 18:58 0.00% nm-flow-meter 997 akips 1 20 0 819M 27600K select 0 12:59 0.00% nm-snmp-trapd 991 akips 1 20 0 78104K 5384K select 1 10:53 0.00% nm-fifo-tee 989 akips 1 20 0 78104K 5764K select 1 10:34 0.00% nm-fifo-tee 990 akips 1 20 0 78104K 5496K select 0 10:31 0.00% nm-fifo-tee 1047 akips 1 20 0 102M 29108K select 0 10:25 0.00% nm-flow-meter 1111 akips 1 20 0 102M 36000K select 0 9:18 0.00% nm-flow-meter 1231 akips 1 20 0 102M 35952K select 1 9:17 0.00% nm-flow-meter 1239 akips 1 20 0 102M 33132K select 0 8:51 0.00% nm-flow-meter 1240 akips 1 20 0 102M 33132K select 1 8:51 0.00% nm-flow-meter 1002 akips 1 20 0 74016K 3480K select 1 8:50 0.00% nm-syslogd 1234 akips 1 20 0 102M 35920K select 1 8:49 0.00% nm-flow-meter 1243 akips 1 20 0 102M 33148K select 0 8:46 0.00% nm-flow-meter 1039 akips 1 20 0 820M 31388K select 0 8:46 0.00% nm-db 1233 akips 1 20 0 102M 31256K select 0 8:43 0.00% nm-flow-meter 1237 akips 1 20 0 102M 33168K select 0 8:43 0.00% nm-flow-meter 1235 akips 1 20 0 102M 29040K select 1 8:41 0.00% nm-flow-meter 1259 akips 1 20 0 102M 29096K select 0 8:40 0.00% nm-flow-meter 1255 akips 1 20 0 102M 31756K select 1 8:40 0.00% nm-flow-meter 1232 akips 1 20 0 102M 31780K select 1 8:39 0.00% nm-flow-meter 1041 akips 1 20 0 820M 45284K select 0 8:34 0.00% nm-db 1044 akips 1 20 0 820M 26172K select 1 8:28 0.00% nm-db 1060 akips 1 20 0 74008K 3380K select 1 8:22 0.00% nm-syslog 1077 akips 1 20 0 820M 26076K select 1 8:22 0.00% nm-db 1048 akips 1 20 0 820M 26076K select 1 8:16 0.00% nm-db 1045 akips 1 20 0 820M 27056K select 1 8:16 0.00% nm-db 1046 akips 1 20 0 820M 26156K select 1 8:16 0.00% nm-db 22541 akips 1 20 0 820M 26092K select 1 8:16 0.00% nm-db 1049 root 1 20 0 820M 26076K select 0 8:15 0.00% nm-db 1043 akips 1 20 0 820M 26076K select 0 8:15 0.00% nm-db 1006 akips 1 20 0 74004K 3364K nanslp 1 8:04 0.00% nm-timed-watcher 62906 akips 1 20 0 102M 33488K select 0 7:50 0.00% nm-flow-meter 89368 akips 1 20 0 96556K 28228K select 0 5:57 0.00% nm-flow-meter 9528 akips 1 20 0 96556K 27828K select 1 5:35 0.00% nm-flow-meter 56009 akips 1 20 0 96556K 26120K select 0 5:03 0.00% nm-flow-meter 985 akips 1 20 0 68684K 5248K nanslp 1 3:52 0.00% perl 986 akips 1 20 0 50112K 5596K nanslp 1 3:04 0.00% perl 22909 akips 1 20 0 88364K 19352K select 0 2:34 0.00% nm-flow-meter 993 akips 1 20 0 68684K 5036K nanslp 0 2:10 0.00% perl 55564 akips 1 20 0 88364K 17916K select 0 2:05 0.00% nm-flow-meter 995 akips 1 20 0 68684K 4904K nanslp 0 2:04 0.00% perl 561 root 1 20 0 21880K 13788K select 0 1:24 0.00% ntpd 994 akips 1 20 0 68684K 4552K piperd 1 0:24 0.00% perl 999 akips 1 20 0 68684K 7888K piperd 1 0:10 0.00% perl 996 akips 1 20 0 68684K 7720K piperd 1 0:09 0.00% perl 648 root 1 20 0 30336K 840K kqread 1 0:08 0.00% master 90631 akips 1 20 0 80172K 12048K select 0 0:07 0.00% nm-flow-meter 920 root 1 20 0 16612K 684K nanslp 1 0:05 0.00% cron 945 akips 1 20 0 21940K 3008K select 0 0:04 0.00% top 412 root 1 20 0 14512K 1644K select 1 0:03 0.00% syslogd 998 akips 1 20 0 68684K 4160K piperd 1 0:02 0.00% perl 98732 akips 1 20 0 820M 28780K select 1 0:02 0.00% nm-db 98844 akips 1 20 0 820M 28780K select 1 0:02 0.00% nm-db 650 postfix 1 20 0 30460K 796K kqread 1 0:01 0.00% qmgr 323 root 1 20 0 13624K 540K select 0 0:01 0.00% devd 368 root 1 20 0 14456K 1560K select 1 0:01 0.00% rtsold 1037 akips 1 20 0 86492K 6444K select 1 0:00 0.00% sshd 928 akips 1 20 0 86492K 6408K select 1 0:00 0.00% sshd 1058 akips 1 52 0 58304K 3688K piperd 0 0:00 0.00% perl 22538 akips 1 52 0 58304K 3688K piperd 0 0:00 0.00% perl 98843 akips 1 48 0 58304K 16864K piperd 1 0:00 0.00% perl 98731 akips 1 48 0 58304K 16864K piperd 0 0:00 0.00% perl 1570 root 1 20 0 35756K 12644K ttyin 1 0:00 0.00% vi 1506 akips 1 20 0 86492K 6552K select 1 0:00 0.00% sshd 1803 root 1 20 0 21940K 3132K CPU0 0 0:00 0.00% top 1096 root 1 20 0 17088K 2676K wait 1 0:00 0.00% sh 926 root 1 25 0 86492K 6352K select 0 0:00 0.00% sshd 1503 root 1 22 0 86492K 6552K select 0 0:00 0.00% sshd 1258 root 1 24 0 86492K 6452K select 0 0:00 0.00% sshd 1034 root 1 23 0 86492K 6456K select 0 0:00 0.00% sshd 1261 akips 1 20 0 86492K 6452K select 0 0:00 0.00% sshd 965 root 1 52 0 14508K 1680K ttyin 1 0:00 0.00% getty 1038 akips 1 20 0 17088K 0K wait 1 0:00 0.00% <sh> 1527 akips 1 26 0 47732K 2684K wait 0 0:00 0.00% su 1095 akips 1 21 0 47728K 0K wait 0 0:00 0.00% <su> 1278 akips 1 39 0 47728K 0K wait 1 0:00 0.00% <su> 929 akips 1 20 0 17088K 0K wait 1 0:00 0.00% <sh> 916 root 1 20 0 61224K 4300K select 0 0:00 0.00% sshd 1507 akips 1 37 0 17088K 2636K wait 1 0:00 0.00% sh 964 root 1 52 0 14508K 1680K ttyin 1 0:00 0.00% getty 1439 postfix 1 20 0 30404K 3848K kqread 1 0:00 0.00% pickup # ps auxww | grep nm-poller akips 1014 0.0 2.6 871820 106540 - Ds 10Feb16 1078:59.06 nm-poller # procstat -k 1014 PID TID COMM TDNAME KSTACK 1014 100365 nm-poller - mi_switch sleepq_timedwait_sig _cv_timedwait_sig_sbt seltdwait kern_select sys_select amd64_syscall Xfast_syscall # ps axHS -o pid,lwp,paddr,tdaddr 1014 PID LWP PADDR TDADDR 1014 100365 fffff800bb819000 fffff800bb963940 (kgdb) set print pretty on (kgdb) p *(struct proc *)0xfffff800bb819000 $4 = { p_list = { le_next = 0xfffff800bb6829d0, le_prev = 0xfffff800bb8199d0 }, p_threads = { tqh_first = 0xfffff800bb963940, tqh_last = 0xfffff800bb963950 }, p_slock = { lock_object = { lo_name = 0xffffffff80fc8245 "process slock", lo_flags = 720896, lo_data = 0, lo_witness = 0x0 }, mtx_lock = 4 }, p_ucred = 0xfffff800bb55f200, p_fd = 0xfffff800bb840000, p_fdtol = 0x0, p_stats = 0xfffff800bb408600, p_limit = 0xfffff8001339c900, p_limco = { c_links = { le = { le_next = 0x0, le_prev = 0x0 }, sle = { sle_next = 0x0 }, tqe = { tqe_next = 0x0, tqe_prev = 0x0 } }, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, c_lock = 0xfffff800bb819100, c_flags = 0, c_iflags = 0, c_cpu = 0 }, p_sigacts = 0xfffff800bba1b000, p_flag = 268435712, p_flag2 = 0, p_state = PRS_NORMAL, p_pid = 1014, p_hash = { le_next = 0x0, le_prev = 0xfffffe00006effb0 }, p_pglist = { le_next = 0x0, le_prev = 0xfffff800bb2c6290 }, p_pptr = 0xfffff800029724e8, p_sibling = { le_next = 0xfffff800bb458000, le_prev = 0xfffff800bb819ab8 }, p_children = { lh_first = 0x0 }, p_mtx = { lock_object = { lo_name = 0xffffffff80fc8238 "process lock", lo_flags = 21168128, lo_data = 0, lo_witness = 0x0 }, mtx_lock = 4 }, p_ksi = 0xfffff800133020e0, p_sigqueue = { sq_signals = { __bits = {0, 0, 0, 0} }, sq_kill = { __bits = {0, 0, 0, 0} }, sq_list = { tqh_first = 0x0, tqh_last = 0xfffff800bb819148 }, sq_proc = 0xfffff800bb819000, sq_flags = 1 }, p_oppid = 0, p_vmspace = 0xfffff800bb9668c0, p_swtick = 24753, p_realtimer = { it_interval = { tv_sec = 0, tv_usec = 0 }, it_value = { tv_sec = 0, tv_usec = 0 } }, p_ru = { ru_utime = { tv_sec = 0, tv_usec = 0 }, ru_stime = { tv_sec = 0, tv_usec = 0 }, ru_maxrss = 0, ru_ixrss = 0, ru_idrss = 0, ru_isrss = 0, ru_minflt = 0, ru_majflt = 0, ru_nswap = 0, ru_inblock = 0, ru_oublock = 0, ru_msgsnd = 0, ru_msgrcv = 0, ru_nsignals = 0, ru_nvcsw = 0, ru_nivcsw = 0 }, p_rux = { rux_runtime = 109968287043644, rux_uticks = 1450275, rux_sticks = 6625562, rux_iticks = 0, rux_uu = 11625969653, rux_su = 53113087346, rux_tu = 64739057000 }, p_crux = { rux_runtime = 0, rux_uticks = 0, rux_sticks = 0, rux_iticks = 0, rux_uu = 0, rux_su = 0, rux_tu = 0 }, p_profthreads = 0, p_exitthreads = 0, p_traceflag = 0, p_tracevp = 0x0, p_tracecred = 0x0, p_textvp = 0xfffff800bb42b3b0, p_lock = 0, p_sigiolst = { slh_first = 0x0 }, p_sigparent = 20, p_sig = 0, p_code = 0, p_stops = 0, p_stype = 0, p_step = 0 '\0', p_pfsflags = 0 '\0', p_nlminfo = 0x0, p_aioinfo = 0x0, p_singlethread = 0x0, p_suspcount = 0, p_xthread = 0x0, p_boundary_count = 0, p_pendingcnt = 0, p_itimers = 0x0, p_procdesc = 0x0, p_magic = 3203398350, p_osrel = 1002000, p_comm = "nm-poller\000\000\000\000\000\000\000\000\000\000", p_pgrp = 0xfffff800bb2c6280, p_sysent = 0xffffffff814d4388, p_args = 0xfffff800133796c0, p_cpulimit = 9223372036854775807, p_nice = 0 '\0', p_fibnum = 0, p_xstat = 0, p_klist = { kl_list = { slh_first = 0x0 }, kl_lock = 0xffffffff80907dc0 <knlist_mtx_lock>, kl_unlock = 0xffffffff80907e00 <knlist_mtx_unlock>, kl_assert_locked = 0xffffffff80907e40 <knlist_mtx_assert_locked>, kl_assert_unlocked = 0xffffffff80907e50 <knlist_mtx_assert_unlocked>, kl_lockarg = 0xfffff800bb819100 }, p_numthreads = 1, p_md = { md_ldt = 0x0, md_ldt_sd = { sd_lolimit = 0, sd_lobase = 0, sd_type = 0, sd_dpl = 0, sd_p = 0, sd_hilimit = 0, sd_xx0 = 0, sd_gran = 0, sd_hibase = 0, sd_xx1 = 0, sd_mbz = 0, sd_xx2 = 0 } }, p_itcallout = { c_links = { le = { le_next = 0x0, le_prev = 0x0 }, sle = { sle_next = 0x0 }, tqe = { tqe_next = 0x0, tqe_prev = 0x0 } }, c_time = 0, c_precision = 0, c_arg = 0x0, c_func = 0, c_lock = 0xfffff800bb819100, c_flags = 0, c_iflags = 0, c_cpu = 0 }, p_acflag = 1, p_peers = 0x0, p_leader = 0xfffff800bb819000, p_emuldata = 0x0, p_label = 0x0, p_sched = 0xfffff800bb8194e8, p_ktr = { stqh_first = 0x0, stqh_last = 0xfffff800bb819448 }, p_mqnotifier = { lh_first = 0x0 }, p_dtrace = 0xfffff800bb502a40, p_pwait = { cv_description = 0xffffffff80fc8c39 "ppwait", cv_waiters = 0 }, p_dbgwait = { cv_description = 0xffffffff80fc8c40 "dbgwait", cv_waiters = 0 }, p_prev_runtime = 0, p_racct = 0x0, p_throttled = 0 '\0', p_orphan = { le_next = 0x0, le_prev = 0x0 }, p_orphans = { lh_first = 0x0 }, p_treeflag = 0, p_reaper = 0xfffff800029724e8, p_reaplist = { lh_first = 0x0 }, p_reapsibling = { le_next = 0xfffff800bb6829d0, le_prev = 0xfffff800bb819ea0 }, p_reapsubtree = 16 } (kgdb) p *(struct thread *)0xfffff800bb963940 $6 = { td_lock = 0xffffffff815cb380, td_proc = 0xfffff800bb819000, td_plist = { tqe_next = 0x0, tqe_prev = 0xfffff800bb819010 }, td_runq = { tqe_next = 0x0, tqe_prev = 0xffffffff815cb5c8 }, td_slpq = { tqe_next = 0x0, tqe_prev = 0xfffff800027d0a00 }, td_lockq = { tqe_next = 0x0, tqe_prev = 0xfffffe0122ba3428 }, td_hash = { le_next = 0x0, le_prev = 0xfffffe0000866068 }, td_cpuset = 0xfffff8000296c3a8, td_sel = 0xfffff80013530f00, td_sleepqueue = 0xfffff800027d0a00, td_turnstile = 0xfffff800bbbbdc00, td_rlqe = 0xfffff800134bad20, td_umtxq = 0xfffff800bb2c6480, td_tid = 100365, td_sigqueue = { sq_signals = { __bits = {0, 0, 0, 0} }, sq_kill = { __bits = {0, 0, 0, 0} }, sq_list = { tqh_first = 0x0, tqh_last = 0xfffff800bb9639f8 }, sq_proc = 0xfffff800bb819000, sq_flags = 1 }, td_lend_user_pri = 255 'ÿ', td_flags = 20, td_inhibitors = 2, td_pflags = 0, td_dupfd = 0, td_sqqueue = 0, td_wchan = 0x0, td_wmesg = 0x0, td_lastcpu = 0 '\0', td_oncpu = 255 'ÿ', td_owepreempt = 0 '\0', td_tsqueue = 0 '\0', td_locks = 29149, td_rw_rlocks = 0, td_lk_slocks = 0, td_stopsched = 0, td_blocked = 0x0, td_lockname = 0x0, td_contested = { lh_first = 0x0 }, td_sleeplocks = 0x0, td_intr_nesting_level = 0, td_pinned = 0, td_ucred = 0xfffff800bb55f200, td_estcpu = 0, td_slptick = 0, td_blktick = 0, td_swvoltick = 1092393710, td_cow = 0, td_ru = { ru_utime = { tv_sec = 0, tv_usec = 0 }, ru_stime = { tv_sec = 0, tv_usec = 0 }, ru_maxrss = 218024, ru_ixrss = 4102525196, ru_idrss = 256714706556, ru_isrss = 1033707136, ru_minflt = 768721, ru_majflt = 3262, ru_nswap = 0, ru_inblock = 573, ru_oublock = 1, ru_msgsnd = 395465017, ru_msgrcv = 178667178, ru_nsignals = 0, ru_nvcsw = 213744212, ru_nivcsw = 56200635 }, td_rux = { rux_runtime = 109968287043644, rux_uticks = 1450275, rux_sticks = 6625562, rux_iticks = 0, rux_uu = 11625969653, rux_su = 53113087346, rux_tu = 64739057000 }, td_incruntime = 0, td_runtime = 109968287043644, td_pticks = 0, td_sticks = 0, td_iticks = 0, td_uticks = 0, td_intrval = 0, td_oldsigmask = { __bits = {0, 0, 0, 0} }, td_generation = 269944847, td_sigstk = { ss_sp = 0x0, ss_size = 0, ss_flags = 4 }, td_xsig = 0, td_profil_addr = 0, td_profil_ticks = 0, td_name = "nm-poller\000\000\000\000\000\000\000\000\000\000", td_fpop = 0x0, td_dbgflags = 0, td_dbgksi = { ksi_link = { tqe_next = 0x0, tqe_prev = 0x0 }, ksi_info = { si_signo = 0, si_errno = 0, si_code = 0, si_pid = 0, si_uid = 0, si_status = 0, si_addr = 0x0, si_value = { sival_int = 0, sival_ptr = 0x0, sigval_int = 0, sigval_ptr = 0x0 }, _reason = { _fault = { _trapno = 0 }, _timer = { _timerid = 0, _overrun = 0 }, _mesgq = { _mqd = 0 }, _poll = { _band = 0 }, __spare__ = { __spare1__ = 0, __spare2__ = {0, 0, 0, 0, 0, 0, 0} } } }, ksi_flags = 0, ksi_sigq = 0x0 }, td_ng_outbound = 0, td_osd = { osd_nslots = 0, osd_slots = 0x0, osd_next = { le_next = 0x0, le_prev = 0x0 } }, td_map_def_user = 0x0, td_dbg_forked = 0, td_vp_reserv = 0, td_no_sleeping = 0, td_dom_rr_idx = 0, td_sigmask = { __bits = {1611157507, 0, 0, 0} }, td_rqindex = 26 '\032', td_base_pri = 104 'h', td_priority = 104 'h', td_pri_class = 3 '\003', td_user_pri = 124 '|', td_base_user_pri = 124 '|', td_pcb = 0xfffffe0122abdcc0, td_state = TDS_INHIBITED, td_retval = {0, 0}, td_slpcallout = { c_links = { le = { le_next = 0x0, le_prev = 0xfffffe000087d5b8 }, sle = { sle_next = 0x0 }, tqe = { tqe_next = 0x0, tqe_prev = 0xfffffe000087d5b8 } }, c_time = 4691782411130129, c_precision = 407753, c_arg = 0xfffff800bb963940, c_func = 0xffffffff809902b0 <sleepq_timeout>, c_lock = 0x0, c_flags = 0, c_iflags = 272, c_cpu = 1 }, td_frame = 0xfffffe0122abdc00, td_kstack_obj = 0xfffff800bb837700, td_kstack = 18446741879562936320, td_kstack_pages = 4, td_critnest = 1, td_md = { md_spinlock_count = 1, md_saved_flags = 582, md_spurflt_addr = 0 }, td_sched = 0xfffff800bb963db0, td_ar = 0x0, td_lprof = {{ lh_first = 0x0 }, { lh_first = 0x0 }}, td_dtrace = 0xfffff80013b46700, td_errno = 0, td_vnet = 0x0, td_vnet_lpush = 0x0, td_intr_frame = 0x0, td_rfppwait_p = 0x0, td_ma = 0x0, td_ma_cnt = 0, td_su = 0x0 } Paul. -- Paul Koch | Founder, CEO AKIPS Network Monitor | akips.com Brisbane, Australia _______________________________________________ freebsd-stable@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/freebsd-stable To unsubscribe, send any mail to "freebsd-stable-unsubscr...@freebsd.org"