On Wed, Feb 24, 2016 at 02:26:19PM +1000, Paul Koch wrote:
> 
> Occasionally we see a process get stuck in an unkillable state and
> the only solution is a hard reboot.
> 
> Occasionally == once every two weeks across 60+ servers, which are spread
> across the globe in customer sites.  We have no remote access to these boxes.
> 
> The process that most often gets stuck, though not the only one, is a large
> scale Ping/SNMP poller.  It is a fairly simplistic C program that just fires
> out lots of ping (raw ICMP socket) and SNMP (UDP socket) requests
> asynchronously.
> 
> We've managed to trap the problem a few times on a test server running in
> VirtualBox, but it also occurs on customer sites who run VMware, Hyper-V,
> QEMU and on bare metal.
> 
> 
> We raised this PR
>  https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=204081
> 
> but suspect it is a similar/same issue as
>  https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=200992
> 
> This is the info we've gathered from the most recent time it has occurred:
> 
> 
> # uname -a
> FreeBSD shed153.akips.com 10.2-RELEASE-p12 FreeBSD 10.2-RELEASE-p12 #0 
> r295070:
> Sat Jan 30 20:03:44 UTC 2016  
> r...@shed21.akips.com:/usr/obj/usr/src/sys/GENERIC amd64

> # ps auxww | grep nm-poller
> akips    1014   0.0  2.6 871820 106540  -  Ds   10Feb16  1078:59.06 nm-poller
> 
> 
> # procstat -k 1014 
>   PID    TID COMM       TDNAME   KSTACK                       
>  1014 100365 nm-poller  -        mi_switch sleepq_timedwait_sig 
> _cv_timedwait_sig_sbt seltdwait kern_select sys_select amd64_syscall 
> Xfast_syscall 
> 

Yes, on HEAD it was reported that https://reviews.freebsd.org/D5221
fixed the problem.  That change is still not reviewed.

I did a back-port to stable/10; the patch below is probably not applicable
to 10.2, you would need 10.3 for it.  Some revisions are missing from
stable/10, but I think that the issue worked around in the patch is at
the core of the troubles many people reported.

Index: sys/kern/kern_timeout.c
===================================================================
--- sys/kern/kern_timeout.c     (revision 295966)
+++ sys/kern/kern_timeout.c     (working copy)
@@ -1127,7 +1127,7 @@ _callout_stop_safe(c, safe)
         * Some old subsystems don't hold Giant while running a callout_stop(),
         * so just discard this check for the moment.
         */
-       if (!safe && c->c_lock != NULL) {
+       if ((safe & CS_DRAIN) == 0 && c->c_lock != NULL) {
                if (c->c_lock == &Giant.lock_object)
                        use_lock = mtx_owned(&Giant);
                else {
@@ -1207,7 +1207,7 @@ again:
                        return (0);
                }
 
-               if (safe) {
+               if ((safe & CS_DRAIN) != 0) {
                        /*
                         * The current callout is running (or just
                         * about to run) and blocking is allowed, so
@@ -1319,7 +1319,7 @@ again:
                        CTR3(KTR_CALLOUT, "postponing stop %p func %p arg %p",
                            c, c->c_func, c->c_arg);
                        CC_UNLOCK(cc);
-                       return (0);
+                       return ((safe & CS_MIGRBLOCK) != 0);
                }
                CTR3(KTR_CALLOUT, "failed to stop %p func %p arg %p",
                    c, c->c_func, c->c_arg);
Index: sys/kern/subr_sleepqueue.c
===================================================================
--- sys/kern/subr_sleepqueue.c  (revision 295966)
+++ sys/kern/subr_sleepqueue.c  (working copy)
@@ -572,7 +572,8 @@ sleepq_check_timeout(void)
         * another CPU, so synchronize with it to avoid having it
         * accidentally wake up a subsequent sleep.
         */
-       else if (callout_stop(&td->td_slpcallout) == 0) {
+       else if (_callout_stop_safe(&td->td_slpcallout, CS_MIGRBLOCK)
+           == 0) {
                td->td_flags |= TDF_TIMEOUT;
                TD_SET_SLEEPING(td);
                mi_switch(SW_INVOL | SWT_SLEEPQTIMO, NULL);
Index: sys/sys/callout.h
===================================================================
--- sys/sys/callout.h   (revision 295966)
+++ sys/sys/callout.h   (working copy)
@@ -62,6 +62,9 @@ struct callout_handle {
        struct callout *callout;
 };
 
+#define        CS_DRAIN                0x0001
+#define        CS_MIGRBLOCK            0x0002
+
 #ifdef _KERNEL
 /* 
  * Note the flags field is actually *two* fields. The c_flags
@@ -81,7 +84,7 @@ struct callout_handle {
  */
 #define        callout_active(c)       ((c)->c_flags & CALLOUT_ACTIVE)
 #define        callout_deactivate(c)   ((c)->c_flags &= ~CALLOUT_ACTIVE)
-#define        callout_drain(c)        _callout_stop_safe(c, 1)
+#define        callout_drain(c)        _callout_stop_safe(c, CS_DRAIN)
 void   callout_init(struct callout *, int);
 void   _callout_init_lock(struct callout *, struct lock_object *, int);
 #define        callout_init_mtx(c, mtx, flags)                                 
\

_______________________________________________
freebsd-stable@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/freebsd-stable
To unsubscribe, send any mail to "freebsd-stable-unsubscr...@freebsd.org"

Reply via email to