Re: Frequent wake-ups for mpm_event

Yann Ylavic Fri, 23 Sep 2016 07:54:13 -0700

With the patch this time...


On Fri, Sep 23, 2016 at 4:53 PM, Yann Ylavic <[email protected]> wrote:
> On Fri, Sep 23, 2016 at 3:13 PM, Yann Ylavic <[email protected]> wrote:
>>
>> If you can try another (additive) patch on the faulty server, it would
>> be awesome to test the one from [1], not sure it applies correctly
>> with my 2.4.x patch though, possibly better with the trunk version
>> over [1] (let me know otherwise).
>
> Not a big deal actually, the attached patch includes both [1] and [2] 
> combined.
>
> Regards,
> Yann.
>
> [1] https://bz.apache.org/bugzilla/show_bug.cgi?id=53555#c55
> [2] https://bz.apache.org/bugzilla/show_bug.cgi?id=57399#c9

Index: modules/generators/mod_status.c
===================================================================
--- modules/generators/mod_status.c	(revision 1761356)
+++ modules/generators/mod_status.c	(working copy)
@@ -531,7 +531,7 @@ static int status_handler(request_rec *r)
 
     if (is_async) {
         int write_completion = 0, lingering_close = 0, keep_alive = 0,
-            connections = 0;
+            connections = 0, stopping = 0, procs = 0;
         /*
          * These differ from 'busy' and 'ready' in how gracefully finishing
          * threads are counted. XXX: How to make this clear in the html?
@@ -539,13 +539,15 @@ static int status_handler(request_rec *r)
         int busy_workers = 0, idle_workers = 0;
         if (!short_report)
             ap_rputs("\n\n<table rules=\"all\" cellpadding=\"1%\">\n"
-                     "<tr><th rowspan=\"2\">PID</th>"
+                     "<tr><th rowspan=\"2\">Slot</th>"
+                         "<th rowspan=\"2\">PID</th>"
+                         "<th rowspan=\"2\">Stopping</th>"
                          "<th colspan=\"2\">Connections</th>\n"
                          "<th colspan=\"2\">Threads</th>"
-                         "<th colspan=\"4\">Async connections</th></tr>\n"
+                         "<th colspan=\"3\">Async connections</th></tr>\n"
                      "<tr><th>total</th><th>accepting</th>"
-                         "<th>busy</th><th>idle</th><th>writing</th>"
-                         "<th>keep-alive</th><th>closing</th></tr>\n", r);
+                         "<th>busy</th><th>idle</th>"
+                         "<th>writing</th><th>keep-alive</th><th>closing</th></tr>\n", r);
         for (i = 0; i < server_limit; ++i) {
             ps_record = ap_get_scoreboard_process(i);
             if (ps_record->pid) {
@@ -555,26 +557,45 @@ static int status_handler(request_rec *r)
                 lingering_close  += ps_record->lingering_close;
                 busy_workers     += thread_busy_buffer[i];
                 idle_workers     += thread_idle_buffer[i];
-                if (!short_report)
-                    ap_rprintf(r, "<tr><td>%" APR_PID_T_FMT "</td><td>%u</td>"
-                                      "<td>%s</td><td>%u</td><td>%u</td>"
+                if (!short_report) {
+                    const char *dying = "no";
+                    const char *old = "";
+                    if (ps_record->quiescing) {
+                        dying = "yes";
+                        stopping++;
+                    }
+                    if (ps_record->generation != mpm_generation)
+                        old = " (old gen)";
+                    procs++;
+                    ap_rprintf(r, "<tr><td>%u</td><td>%" APR_PID_T_FMT "</td>"
+                                      "<td>%s%s</td>"
+                                      "<td>%u</td><td>%s</td>"
+                                      "<td>%u</td><td>%u</td>"
                                       "<td>%u</td><td>%u</td><td>%u</td>"
                                       "</tr>\n",
-                               ps_record->pid, ps_record->connections,
+                               i, ps_record->pid,
+                               dying, old,
+                               ps_record->connections,
                                ps_record->not_accepting ? "no" : "yes",
-                               thread_busy_buffer[i], thread_idle_buffer[i],
+                               thread_busy_buffer[i],
+                               thread_idle_buffer[i],
                                ps_record->write_completion,
                                ps_record->keep_alive,
                                ps_record->lingering_close);
+                }
             }
         }
         if (!short_report) {
-            ap_rprintf(r, "<tr><td>Sum</td><td>%d</td><td>&nbsp;</td><td>%d</td>"
-                          "<td>%d</td><td>%d</td><td>%d</td><td>%d</td>"
+            ap_rprintf(r, "<tr><td>Sum</td>"
+                          "<td>%d</td><td>%d</td>"
+                          "<td>%d</td><td>&nbsp;</td>"
+                          "<td>%d</td><td>%d</td>"
+                          "<td>%d</td><td>%d</td><td>%d</td>"
                           "</tr>\n</table>\n",
-                          connections, busy_workers, idle_workers,
+                          procs, stopping,
+                          connections,
+                          busy_workers, idle_workers,
                           write_completion, keep_alive, lingering_close);
-
         }
         else {
             ap_rprintf(r, "ConnsTotal: %d\n"
Index: server/mpm/event/event.c
===================================================================
--- server/mpm/event/event.c	(revision 1761356)
+++ server/mpm/event/event.c	(working copy)
@@ -100,6 +100,8 @@
 #include <limits.h>             /* for INT_MAX */
 
 
+#define VOLATILE_READ(T, x) (*(volatile T *)&(x))
+
 /* Limit on the total --- clients will be locked out if more servers than
  * this are needed.  It is intended solely to keep the server from crashing
  * when things get out of hand.
@@ -160,20 +162,24 @@
 #endif
 #define WORKER_FACTOR_SCALE   16  /* scale factor to allow fractional values */
 static unsigned int worker_factor = DEFAULT_WORKER_FACTOR * WORKER_FACTOR_SCALE;
+    /* AsyncRequestWorkerFactor * 16 */
 
-static int threads_per_child = 0;   /* Worker threads per child */
-static int ap_daemons_to_start = 0;
-static int min_spare_threads = 0;
-static int max_spare_threads = 0;
-static int ap_daemons_limit = 0;
-static int max_workers = 0;
-static int server_limit = 0;
-static int thread_limit = 0;
+static int threads_per_child = 0;           /* ThreadsPerChild */
+static int ap_daemons_to_start = 0;         /* StartServers */
+static int min_spare_threads = 0;           /* MinSpareThreads */
+static int max_spare_threads = 0;           /* MaxSpareThreads */
+static int active_daemons_limit = 0;        /* MaxRequestWorkers / ThreadsPerChild */
+static int active_daemons = 0;              /* workers that still active, i.e. are
+                                               not shutting down gracefully */
+static int max_workers = 0;                 /* MaxRequestWorkers */
+static int server_limit = 0;                /* ServerLimit */
+static int thread_limit = 0;                /* ThreadLimit */
 static int had_healthy_child = 0;
 static int dying = 0;
 static int workers_may_exit = 0;
 static int start_thread_may_exit = 0;
 static int listener_may_exit = 0;
+static int listener_is_wakeable = 0;        /* Pollset supports APR_POLLSET_WAKEABLE */
 static int num_listensocks = 0;
 static apr_int32_t conns_this_child;        /* MaxConnectionsPerChild, only access
                                                in listener thread */
@@ -181,6 +187,8 @@ static apr_uint32_t connection_count = 0;   /* Num
 static apr_uint32_t lingering_count = 0;    /* Number of connections in lingering close */
 static apr_uint32_t suspended_count = 0;    /* Number of suspended connections */
 static apr_uint32_t clogged_count = 0;      /* Number of threads processing ssl conns */
+static apr_uint32_t threads_shutdown = 0;   /* Number of threads that have shutdown
+                                               early during graceful termination */
 static int resource_shortage = 0;
 static fd_queue_t *worker_queue;
 static fd_queue_info_t *worker_queue_info;
@@ -194,6 +202,17 @@ module AP_MODULE_DECLARE_DATA mpm_event_module;
 struct event_srv_cfg_s;
 typedef struct event_srv_cfg_s event_srv_cfg;
 
+/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
 struct event_conn_state_t {
     /** APR_RING of expiration timeouts */
     APR_RING_ENTRY(event_conn_state_t) timeout_list;
@@ -239,24 +258,52 @@ static struct timeout_queue *write_completion_q,
                             *keepalive_q,
                             *linger_q,
                             *short_linger_q;
+static apr_time_t queues_next_expiry;
 
 static apr_pollfd_t *listener_pollfd;
 
+/* Prevent extra poll/wakeup calls for timeouts close in the future (queues
+ * have the granularity of a second anyway).
+ * XXX: Wouldn't 0.5s (instead of 0.1s) be "enough"?
+ */
+#define TIMEOUT_FUDGE_FACTOR APR_TIME_C(100000) /* 100 ms */
+
+/* Same goal as for TIMEOUT_FUDGE_FACTOR (avoid extra poll calls), but applied
+ * to timers. Since their timeouts are custom (user defined), we can't be too
+ * approximative here (hence using 0.01s).
+ */
+#define EVENT_FUDGE_FACTOR APR_TIME_C(10000) /* 10 ms */
+
 /*
  * Macros for accessing struct timeout_queue.
  * For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
  */
-#define TO_QUEUE_APPEND(q, el)                                                \
-    do {                                                                      \
-        APR_RING_INSERT_TAIL(&(q)->head, el, event_conn_state_t,              \
-                             timeout_list);                                   \
-        ++*(q)->total;                                                        \
-        ++(q)->count;                                                         \
-    } while (0)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+{
+    apr_time_t q_expiry;
 
+    APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+    ++*q->total;
+    ++q->count;
+
+    /* Cheaply update the overall queues' next expiry according to the
+     * first entry of this queue (oldest), if necessary.
+     */
+    el = APR_RING_FIRST(&q->head);
+    q_expiry = el->queue_timestamp + q->timeout;
+    if (!queues_next_expiry
+            || queues_next_expiry > q_expiry + TIMEOUT_FUDGE_FACTOR) {
+        VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
+}
+
 #define TO_QUEUE_REMOVE(q, el)                                                \
     do {                                                                      \
-        APR_RING_REMOVE(el, timeout_list);                                    \
+        APR_RING_REMOVE((el), timeout_list);                                  \
         --*(q)->total;                                                        \
         --(q)->count;                                                         \
     } while (0)
@@ -272,25 +319,14 @@ static apr_pollfd_t *listener_pollfd;
         (q)->next = NULL;                                                     \
     } while (0)
 
-#define TO_QUEUE_ELEM_INIT(el) APR_RING_ELEM_INIT(el, timeout_list)
+#define TO_QUEUE_ELEM_INIT(el) \
+    APR_RING_ELEM_INIT(el, timeout_list)
 
-/*
- * The pollset for sockets that are in any of the timeout queues. Currently
- * we use the timeout_mutex to make sure that connections are added/removed
- * atomically to/from both event_pollset and a timeout queue. Otherwise
- * some confusion can happen under high load if timeout queues and pollset
- * get out of sync.
- * XXX: It should be possible to make the lock unnecessary in many or even all
- * XXX: cases.
- */
-static apr_pollset_t *event_pollset;
-
 /* The structure used to pass unique initialization info to each thread */
 typedef struct
 {
-    int pid;
-    int tid;
-    int sd;
+    int pslot;  /* process slot */
+    int tslot;  /* worker slot of the thread */
 } proc_info;
 
 /* Structure used to pass information to the thread responsible for
@@ -335,7 +371,15 @@ typedef struct event_retained_data {
      * scoreboard.
      */
     int max_daemons_limit;
+
     /*
+     * All running workers, active and shutting down, including those that
+     * may be left from before a graceful restart.
+     * Not kept up-to-date when shutdown is pending.
+     */
+    int total_daemons;
+
+    /*
      * idle_spawn_rate is the number of children that will be spawned on the
      * next maintenance cycle if there aren't enough idle servers.  It is
      * maintained per listeners bucket, doubled up to MAX_SPAWN_RATE, and
@@ -462,6 +506,11 @@ static void wakeup_listener(void)
         return;
     }
 
+    /* Unblock the listener if it's poll()ing */
+    if (listener_is_wakeable) {
+        apr_pollset_wakeup(event_pollset);
+    }
+
     /* unblock the listener if it's waiting for a worker */
     ap_queue_info_term(worker_queue_info);
 
@@ -548,7 +597,7 @@ static int event_query(int query_code, int *result
         *result = ap_max_requests_per_child;
         break;
     case AP_MPMQ_MAX_DAEMONS:
-        *result = ap_daemons_limit;
+        *result = active_daemons_limit;
         break;
     case AP_MPMQ_MPM_STATE:
         *result = mpm_state;
@@ -585,27 +634,6 @@ static void event_note_child_started(int slot, pid
                         retained->my_generation, slot, MPM_CHILD_STARTED);
 }
 
-static void event_note_child_lost_slot(int slot, pid_t newpid)
-{
-    ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(00458)
-                 "pid %" APR_PID_T_FMT " taking over scoreboard slot from "
-                 "%" APR_PID_T_FMT "%s",
-                 newpid,
-                 ap_scoreboard_image->parent[slot].pid,
-                 ap_scoreboard_image->parent[slot].quiescing ?
-                 " (quiescing)" : "");
-    ap_run_child_status(ap_server_conf,
-                        ap_scoreboard_image->parent[slot].pid,
-                        ap_scoreboard_image->parent[slot].generation,
-                        slot, MPM_CHILD_LOST_SLOT);
-    /* Don't forget about this exiting child process, or we
-     * won't be able to kill it if it doesn't exit by the
-     * time the server is shut down.
-     */
-    ap_register_extra_mpm_process(ap_scoreboard_image->parent[slot].pid,
-                                  ap_scoreboard_image->parent[slot].generation);
-}
-
 static const char *event_get_name(void)
 {
     return "event";
@@ -656,7 +684,11 @@ static apr_status_t decrement_connection_count(voi
         default:
             break;
     }
-    apr_atomic_dec32(&connection_count);
+    /* Unblock the listener if it's waiting for connection_count = 0 */
+    if (!apr_atomic_dec32(&connection_count)
+             && listener_is_wakeable && listener_may_exit) {
+        apr_pollset_wakeup(event_pollset);
+    }
     return APR_SUCCESS;
 }
 
@@ -819,6 +851,7 @@ static void notify_resume(event_conn_state_t *cs,
 
 static int start_lingering_close_common(event_conn_state_t *cs, int in_worker)
 {
+    int done = 0;
     apr_status_t rv;
     struct timeout_queue *q;
     apr_socket_t *csd = cs->pfd.desc.s;
@@ -830,7 +863,6 @@ static int start_lingering_close_common(event_conn
 #else
     apr_socket_timeout_set(csd, 0);
 #endif
-    cs->queue_timestamp = apr_time_now();
     /*
      * If some module requested a shortened waiting period, only wait for
      * 2s (SECONDS_TO_LINGER). This is useful for mitigating certain
@@ -851,25 +883,25 @@ static int start_lingering_close_common(event_conn
     else {
         cs->c->sbh = NULL;
     }
-    apr_thread_mutex_lock(timeout_mutex);
-    TO_QUEUE_APPEND(q, cs);
     cs->pfd.reqevents = (
             cs->pub.sense == CONN_SENSE_WANT_WRITE ? APR_POLLOUT :
                     APR_POLLIN) | APR_POLLHUP | APR_POLLERR;
     cs->pub.sense = CONN_SENSE_DEFAULT;
+    cs->queue_timestamp = apr_time_now();
+    apr_thread_mutex_lock(timeout_mutex);
     rv = apr_pollset_add(event_pollset, &cs->pfd);
+    if (rv == APR_SUCCESS || APR_STATUS_IS_EEXIST(rv)) {
+        TO_QUEUE_APPEND(q, cs);
+        done = 1;
+    }
     apr_thread_mutex_unlock(timeout_mutex);
-    if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) {
+    if (!done) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092)
                      "start_lingering_close: apr_pollset_add failure");
-        apr_thread_mutex_lock(timeout_mutex);
-        TO_QUEUE_REMOVE(q, cs);
-        apr_thread_mutex_unlock(timeout_mutex);
         apr_socket_close(cs->pfd.desc.s);
         ap_push_pool(worker_queue_info, cs->p);
-        return 0;
     }
-    return 1;
+    return done;
 }
 
 /*
@@ -911,6 +943,8 @@ static int start_lingering_close_nonblocking(event
         || apr_socket_shutdown(csd, APR_SHUTDOWN_WRITE) != APR_SUCCESS) {
         apr_socket_close(csd);
         ap_push_pool(worker_queue_info, cs->p);
+        if (dying)
+            ap_queue_interrupt_one(worker_queue);
         return 0;
     }
     return start_lingering_close_common(cs, 0);
@@ -934,6 +968,8 @@ static int stop_lingering_close(event_conn_state_t
         AP_DEBUG_ASSERT(0);
     }
     ap_push_pool(worker_queue_info, cs->p);
+    if (dying)
+        ap_queue_interrupt_one(worker_queue);
     return 0;
 }
 
@@ -1129,15 +1165,15 @@ read_request:
              * Set a write timeout for this connection, and let the
              * event thread poll for writeability.
              */
-            cs->queue_timestamp = apr_time_now();
             notify_suspend(cs);
-            apr_thread_mutex_lock(timeout_mutex);
-            TO_QUEUE_APPEND(cs->sc->wc_q, cs);
             cs->pfd.reqevents = (
                     cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN :
                             APR_POLLOUT) | APR_POLLHUP | APR_POLLERR;
             cs->pub.sense = CONN_SENSE_DEFAULT;
-            rc = apr_pollset_add(event_pollset, &cs->pfd);
+            cs->queue_timestamp = apr_time_now();
+            apr_thread_mutex_lock(timeout_mutex);
+            apr_pollset_add(event_pollset, &cs->pfd);
+            TO_QUEUE_APPEND(cs->sc->wc_q, cs);
             apr_thread_mutex_unlock(timeout_mutex);
             return;
         }
@@ -1166,14 +1202,13 @@ read_request:
          * timeout today.  With a normal client, the socket will be readable in
          * a few milliseconds anyway.
          */
-        cs->queue_timestamp = apr_time_now();
         notify_suspend(cs);
-        apr_thread_mutex_lock(timeout_mutex);
-        TO_QUEUE_APPEND(cs->sc->ka_q, cs);
-
         /* Add work to pollset. */
         cs->pfd.reqevents = APR_POLLIN;
+        cs->queue_timestamp = apr_time_now();
+        apr_thread_mutex_lock(timeout_mutex);
         rc = apr_pollset_add(event_pollset, &cs->pfd);
+        TO_QUEUE_APPEND(cs->sc->ka_q, cs);
         apr_thread_mutex_unlock(timeout_mutex);
 
         if (rc != APR_SUCCESS) {
@@ -1219,6 +1254,9 @@ static void close_listeners(int process_slot, int
         }
         /* wake up the main thread */
         kill(ap_my_pid, SIGTERM);
+
+        ap_free_idle_pools(worker_queue_info);
+        ap_queue_interrupt_all(worker_queue);
     }
 }
 
@@ -1347,6 +1385,7 @@ static void get_worker(int *have_idle_worker_p, in
 static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
 
 static apr_skiplist *timer_skiplist;
+static apr_time_t timers_next_expiry;
 
 /* The following compare function is used by apr_skiplist_insert() to keep the
  * elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1396,6 +1435,18 @@ static apr_status_t event_register_timed_callback(
     /* Okay, add sorted by when.. */
     apr_skiplist_insert(timer_skiplist, te);
 
+    /* Cheaply update the overall timers' next expiry according to
+     * this event, if necessary.
+     */
+    if (!timers_next_expiry
+            || timers_next_expiry > te->when + EVENT_FUDGE_FACTOR) {
+        VOLATILE_READ(apr_time_t, timers_next_expiry) = te->when;
+        /* Unblock the poll()ing listener for it to update its timeout. */
+        if (listener_is_wakeable) {
+            apr_pollset_wakeup(event_pollset);
+        }
+    }
+
     apr_thread_mutex_unlock(g_timer_skiplist_mtx);
 
     return APR_SUCCESS;
@@ -1439,6 +1490,8 @@ static void process_lingering_close(event_conn_sta
     TO_QUEUE_ELEM_INIT(cs);
 
     ap_push_pool(worker_queue_info, cs->p);
+    if (dying)
+        ap_queue_interrupt_one(worker_queue);
 }
 
 /* call 'func' for all elements of 'q' with timeout less than 'timeout_time'.
@@ -1464,20 +1517,31 @@ static void process_timeout_queue(struct timeout_q
         count = 0;
         cs = first = last = APR_RING_FIRST(&qp->head);
         while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
-                                       timeout_list)
-               /* Trash the entry if:
-                * - no timeout_time was given (asked for all), or
-                * - it expired (according to the queue timeout), or
-                * - the system clock skewed in the past: no entry should be
-                *   registered above the given timeout_time (~now) + the queue
-                *   timeout, we won't keep any here (eg. for centuries).
-                * Stop otherwise, no following entry will match thanks to the
-                * single timeout per queue (entries are added to the end!).
-                * This allows maintenance in O(1).
-                */
-               && (!timeout_time
-                   || cs->queue_timestamp + qp->timeout < timeout_time
-                   || cs->queue_timestamp > timeout_time + qp->timeout)) {
+                                       timeout_list)) {
+            /* Trash the entry if:
+             * - no timeout_time was given (asked for all), or
+             * - it expired (according to the queue timeout), or
+             * - the system clock skewed in the past: no entry should be
+             *   registered above the given timeout_time (~now) + the queue
+             *   timeout, we won't keep any here (eg. for centuries).
+             *
+             * Otherwise stop, no following entry will match thanks to the
+             * single timeout per queue (entries are added to the end!).
+             * This allows maintenance in O(1).
+             */
+            if (timeout_time
+                    && cs->queue_timestamp + qp->timeout > timeout_time
+                    && cs->queue_timestamp < timeout_time + qp->timeout) {
+                /* Since this is the next expiring of this queue, update the
+                 * overall queues' next expiry if it's later than this one.
+                 */
+                apr_time_t q_expiry = cs->queue_timestamp + qp->timeout;
+                if (!queues_next_expiry || queues_next_expiry > q_expiry) {
+                    VOLATILE_READ(apr_time_t, queues_next_expiry) = q_expiry;
+                }
+                break;
+            }
+
             last = cs;
             rv = apr_pollset_remove(event_pollset, &cs->pfd);
             if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1514,11 +1578,11 @@ static void process_timeout_queue(struct timeout_q
 
 static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
 {
-    timer_event_t *ep;
     timer_event_t *te;
     apr_status_t rc;
     proc_info *ti = dummy;
-    int process_slot = ti->pid;
+    int process_slot = ti->pslot;
+    struct process_score *ps = ap_get_scoreboard_process(process_slot);
     apr_pool_t *tpool = apr_thread_pool_get(thd);
     void *csd = NULL;
     apr_pool_t *ptrans;         /* Pool for per-transaction stuff */
@@ -1527,21 +1591,13 @@ static void * APR_THREAD_FUNC listener_thread(apr_
     const apr_pollfd_t *out_pfd;
     apr_int32_t num = 0;
     apr_interval_time_t timeout_interval;
-    apr_time_t timeout_time = 0, now, last_log;
     listener_poll_type *pt;
     int closed = 0, listeners_disabled = 0;
+    apr_time_t last_log;
 
     last_log = apr_time_now();
     free(ti);
 
-    /* the following times out events that are really close in the future
-     *   to prevent extra poll calls
-     *
-     * current value is .1 second
-     */
-#define TIMEOUT_FUDGE_FACTOR 100000
-#define EVENT_FUDGE_FACTOR 10000
-
     rc = init_pollset(tpool);
     if (rc != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rc, ap_server_conf,
@@ -1559,6 +1615,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_
 
     for (;;) {
         int workers_were_busy = 0;
+        apr_time_t now, timeout_time;
+        int keepalives;
+
         if (listener_may_exit) {
             close_listeners(process_slot, &closed);
             if (terminate_mode == ST_UNGRACEFUL
@@ -1569,7 +1628,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_
         if (conns_this_child <= 0)
             check_infinite_requests();
 
+        /* Update poll() timeout below according to the next expiring
+         * timer or queue entry, if any.
+         */
+        timeout_interval = -1;
         now = apr_time_now();
+
         if (APLOGtrace6(ap_server_conf)) {
             /* trace log status every second */
             if (now - last_log > apr_time_from_msec(1000)) {
@@ -1584,28 +1648,71 @@ static void * APR_THREAD_FUNC listener_thread(apr_
                              *keepalive_q->total,
                              apr_atomic_read32(&lingering_count),
                              apr_atomic_read32(&suspended_count));
+                if (dying) {
+                    ap_log_error(APLOG_MARK, APLOG_TRACE6, 0, ap_server_conf,
+                                 "%u/%u workers shutdown",
+                                 apr_atomic_read32(&threads_shutdown),
+                                 threads_per_child);
+                }
                 apr_thread_mutex_unlock(timeout_mutex);
             }
         }
 
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        te = apr_skiplist_peek(timer_skiplist);
-        if (te) {
-            if (te->when > now) {
-                timeout_interval = te->when - now;
+        /* Avoid locking if there's no expiring timer in the list,
+         * poll() will be woken up anyway if a new timer comes in.
+         */
+        timeout_time = VOLATILE_READ(apr_time_t, timers_next_expiry);
+        if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) {
+            /* Push expired timers to a worker, the first one remaining
+             * determines the maximum time to poll() below.
+             */
+            apr_thread_mutex_lock(g_timer_skiplist_mtx);
+            while ((te = apr_skiplist_peek(timer_skiplist))) {
+                if (te->when < now + EVENT_FUDGE_FACTOR) {
+                    apr_skiplist_pop(timer_skiplist, NULL);
+                    push_timer2worker(te);
+                 }
+                 else {
+                    timeout_interval = te->when - now;
+                    timers_next_expiry = te->when;
+                    break;
+                }
             }
-            else {
-                timeout_interval = 1;
+            /* If there are no timers in the list, either the listener is
+             * wakeable and it can poll() indefinitely until a wake up occurs,
+             * or periodic checks must be performed.
+             */
+            if (!te) {
+                if (!listener_is_wakeable) {
+                    timeout_interval = apr_time_from_msec(100);
+                }
+                timers_next_expiry = 0;
             }
+            apr_thread_mutex_unlock(g_timer_skiplist_mtx);
         }
-        else {
+
+        /* Same for queues, if the listener is wakeable use the current expiry
+         * time and expect to be woken up for an earlier one, otherwise use the
+         * maintenance timeout (max).
+         */
+        timeout_time = VOLATILE_READ(apr_time_t, queues_next_expiry);
+        if (timeout_time
+                && (timeout_interval < 0
+                    || timeout_time <= now
+                    || timeout_interval > timeout_time - now)) {
+            timeout_interval = timeout_time > now ? timeout_time - now : 1;
+        }
+        if (!listener_is_wakeable
+                && timeout_interval > apr_time_from_msec(100)) {
             timeout_interval = apr_time_from_msec(100);
         }
-        apr_thread_mutex_unlock(g_timer_skiplist_mtx);
 
         rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd);
         if (rc != APR_SUCCESS) {
             if (APR_STATUS_IS_EINTR(rc)) {
+                /* Woken up, either update timeouts or shutdown,
+                 * both logics are above.
+                 */
                 continue;
             }
             if (!APR_STATUS_IS_TIMEUP(rc)) {
@@ -1614,6 +1721,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_
                              "shutdown process gracefully");
                 signal_threads(ST_GRACEFUL);
             }
+            num = 0;
         }
 
         if (listener_may_exit) {
@@ -1623,21 +1731,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_
                 break;
         }
 
-        now = apr_time_now();
-        apr_thread_mutex_lock(g_timer_skiplist_mtx);
-        ep = apr_skiplist_peek(timer_skiplist);
-        while (ep) {
-            if (ep->when < now + EVENT_FUDGE_FACTOR) {
-                apr_skiplist_pop(timer_skiplist, NULL);
-                push_timer2worker(ep);
-            }
-            else {
-                break;
-            }
-            ep = apr_skiplist_peek(timer_skiplist);
-        }
-        apr_thread_mutex_unlock(g_timer_skiplist_mtx);
-
         while (num) {
             pt = (listener_poll_type *) out_pfd->client_data;
             if (pt->type == PT_CSD) {
@@ -1803,42 +1896,35 @@ static void * APR_THREAD_FUNC listener_thread(apr_
          * r->request_time for new requests
          */
         now = apr_time_now();
-        /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock
-         * skew (if the system time is set back in the meantime, timeout_time
-         * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise).
+        /* We process the timeout queues here only when their overall next
+         * expiry (read once above) is over. This happens accurately since
+         * adding to the queues (in workers) can only decrease this expiry,
+         * while latest ones are only taken into account here (in listener)
+         * during queues' processing, with the lock held. This works both
+         * with and without wake-ability.
          */
-        if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) {
-            struct process_score *ps;
+        if (timeout_time && timeout_time < (now = apr_time_now())) {
             timeout_time = now + TIMEOUT_FUDGE_FACTOR;
 
             /* handle timed out sockets */
             apr_thread_mutex_lock(timeout_mutex);
 
+            /* Processing all the queues below will recompute this. */
+            queues_next_expiry = 0;
+
             /* Step 1: keepalive timeouts */
-            /* If all workers are busy, we kill older keep-alive connections so that they
-             * may connect to another process.
-             */
-            if (workers_were_busy && *keepalive_q->total) {
-                ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
-                             "All workers are busy, will close %d keep-alive "
-                             "connections",
-                             *keepalive_q->total);
-                process_timeout_queue(keepalive_q, 0,
-                                      start_lingering_close_nonblocking);
-            }
-            else {
-                process_timeout_queue(keepalive_q, timeout_time,
-                                      start_lingering_close_nonblocking);
-            }
+            process_timeout_queue(keepalive_q, timeout_time,
+                                  start_lingering_close_nonblocking);
             /* Step 2: write completion timeouts */
             process_timeout_queue(write_completion_q, timeout_time,
                                   start_lingering_close_nonblocking);
             /* Step 3: (normal) lingering close completion timeouts */
-            process_timeout_queue(linger_q, timeout_time, stop_lingering_close);
+            process_timeout_queue(linger_q, timeout_time,
+                                  stop_lingering_close);
             /* Step 4: (short) lingering close completion timeouts */
-            process_timeout_queue(short_linger_q, timeout_time, stop_lingering_close);
+            process_timeout_queue(short_linger_q, timeout_time,
+                                  stop_lingering_close);
 
-            ps = ap_get_scoreboard_process(process_slot);
             ps->write_completion = *write_completion_q->total;
             ps->keep_alive = *keepalive_q->total;
             apr_thread_mutex_unlock(timeout_mutex);
@@ -1847,6 +1933,22 @@ static void * APR_THREAD_FUNC listener_thread(apr_
             ps->suspended = apr_atomic_read32(&suspended_count);
             ps->lingering_close = apr_atomic_read32(&lingering_count);
         }
+        else if ((workers_were_busy || dying)
+                 && (keepalives = VOLATILE_READ(int, *keepalive_q->total))) {
+            /* If all workers are busy, we kill older keep-alive connections so
+             * that they may connect to another process.
+             */
+            ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+                         "All workers are %s, will close %d keep-alive "
+                         "connections", dying ? "dying" : "busy",
+                         keepalives);
+            apr_thread_mutex_lock(timeout_mutex);
+            process_timeout_queue(keepalive_q, 0,
+                                  start_lingering_close_nonblocking);
+            ps->keep_alive = 0;
+            apr_thread_mutex_unlock(timeout_mutex);
+        }
+
         if (listeners_disabled && !workers_were_busy
             && (int)apr_atomic_read32(&connection_count)
                - (int)apr_atomic_read32(&lingering_count)
@@ -1869,6 +1971,34 @@ static void * APR_THREAD_FUNC listener_thread(apr_
     return NULL;
 }
 
+/*
+ * During graceful shutdown, if there are more running worker threads than
+ * open connections, exit one worker thread.
+ *
+ * return 1 if thread should exit, 0 if it should continue running.
+ */
+static int worker_thread_should_exit_early(void)
+{
+    for (;;) {
+        apr_uint32_t conns = apr_atomic_read32(&connection_count);
+        apr_uint32_t dead = apr_atomic_read32(&threads_shutdown);
+        apr_uint32_t newdead;
+
+        AP_DEBUG_ASSERT(dead <= threads_per_child);
+        if (conns >= threads_per_child - dead)
+            return 0;
+
+        newdead = dead + 1;
+        if (apr_atomic_cas32(&threads_shutdown, newdead, dead) == dead) {
+            /*
+             * No other thread has exited in the mean time, safe to exit
+             * this one.
+             */
+            return 1;
+        }
+    }
+}
+
 /* XXX For ungraceful termination/restart, we definitely don't want to
  *     wait for active connections to finish but we may want to wait
  *     for idle workers to get out of the queue code and release mutexes,
@@ -1879,8 +2009,8 @@ static void * APR_THREAD_FUNC listener_thread(apr_
 static void *APR_THREAD_FUNC worker_thread(apr_thread_t * thd, void *dummy)
 {
     proc_info *ti = dummy;
-    int process_slot = ti->pid;
-    int thread_slot = ti->tid;
+    int process_slot = ti->pslot;
+    int thread_slot = ti->tslot;
     apr_socket_t *csd = NULL;
     event_conn_state_t *cs;
     apr_pool_t *ptrans;         /* Pool for per-transaction stuff */
@@ -1916,6 +2046,9 @@ static void *APR_THREAD_FUNC worker_thread(apr_thr
         if (workers_may_exit) {
             break;
         }
+        if (dying && worker_thread_should_exit_early()) {
+            break;
+        }
 
         te = NULL;
         rv = ap_queue_pop_something(worker_queue, &csd, &cs, &ptrans, &te);
@@ -1993,9 +2126,8 @@ static void create_listener_thread(thread_starter
     apr_status_t rv;
 
     my_info = (proc_info *) ap_malloc(sizeof(proc_info));
-    my_info->pid = my_child_num;
-    my_info->tid = -1;          /* listener thread doesn't have a thread slot */
-    my_info->sd = 0;
+    my_info->pslot = my_child_num;
+    my_info->tslot = -1;      /* listener thread doesn't have a thread slot */
     rv = apr_thread_create(&ts->listener, thread_attr, listener_thread,
                            my_info, pchild);
     if (rv != APR_SUCCESS) {
@@ -2027,6 +2159,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
     int prev_threads_created;
     int max_recycled_pools = -1;
     int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+    /* XXX don't we need more to handle K-A or lingering close? */
+    const apr_uint32_t pollset_size = threads_per_child * 2;
 
     /* We must create the fd queues before we start up the listener
      * and worker threads. */
@@ -2066,24 +2200,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
 
     /* Create the main pollset */
     for (i = 0; i < sizeof(good_methods) / sizeof(void*); i++) {
-        rv = apr_pollset_create_ex(&event_pollset,
-                            threads_per_child*2, /* XXX don't we need more, to handle
-                                                * connections in K-A or lingering
-                                                * close?
-                                                */
-                            pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
-                            good_methods[i]);
+        apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+                             APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
         if (rv == APR_SUCCESS) {
+            listener_is_wakeable = 1;
             break;
         }
+        flags &= ~APR_POLLSET_WAKEABLE;
+        rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+                                   good_methods[i]);
+        if (rv == APR_SUCCESS) {
+            break;
+        }
     }
     if (rv != APR_SUCCESS) {
-        rv = apr_pollset_create(&event_pollset,
-                               threads_per_child*2, /* XXX don't we need more, to handle
-                                                     * connections in K-A or lingering
-                                                     * close?
-                                                     */
-                               pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+        rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+                                APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
     }
     if (rv != APR_SUCCESS) {
         ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2092,7 +2226,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
     }
 
     ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
-                 "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+                 "start_threads: Using %s (%swakeable)",
+                 apr_pollset_method_name(event_pollset),
+                 listener_is_wakeable ? "" : "not ");
     worker_sockets = apr_pcalloc(pchild, threads_per_child
                                  * sizeof(apr_socket_t *));
 
@@ -2103,14 +2239,13 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
             int status =
                 ap_scoreboard_image->servers[my_child_num][i].status;
 
-            if (status != SERVER_GRACEFUL && status != SERVER_DEAD) {
+            if (status != SERVER_DEAD) {
                 continue;
             }
 
             my_info = (proc_info *) ap_malloc(sizeof(proc_info));
-            my_info->pid = my_child_num;
-            my_info->tid = i;
-            my_info->sd = 0;
+            my_info->pslot = my_child_num;
+            my_info->tslot = i;
 
             /* We are creating threads right now */
             ap_update_child_status_from_indexes(my_child_num, i,
@@ -2411,6 +2546,15 @@ static int make_child(server_rec * s, int slot, in
         retained->max_daemons_limit = slot + 1;
     }
 
+    if (ap_scoreboard_image->parent[slot].pid != 0) {
+        /* XXX replace with assert or remove ? */
+        ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(03455)
+                 "BUG: Scoreboard slot %d should be empty but is "
+                 "in use by pid %" APR_PID_T_FMT,
+                 slot, ap_scoreboard_image->parent[slot].pid);
+        return -1;
+    }
+
     if (one_process) {
         my_bucket = &all_buckets[0];
 
@@ -2464,17 +2608,12 @@ static int make_child(server_rec * s, int slot, in
         return -1;
     }
 
-    if (ap_scoreboard_image->parent[slot].pid != 0) {
-        /* This new child process is squatting on the scoreboard
-         * entry owned by an exiting child process, which cannot
-         * exit until all active requests complete.
-         */
-        event_note_child_lost_slot(slot, pid);
-    }
     ap_scoreboard_image->parent[slot].quiescing = 0;
     ap_scoreboard_image->parent[slot].not_accepting = 0;
     ap_scoreboard_image->parent[slot].bucket = bucket;
     event_note_child_started(slot, pid);
+    active_daemons++;
+    retained->total_daemons++;
     return 0;
 }
 
@@ -2483,7 +2622,7 @@ static void startup_children(int number_to_start)
 {
     int i;
 
-    for (i = 0; number_to_start && i < ap_daemons_limit; ++i) {
+    for (i = 0; number_to_start && i < server_limit; ++i) {
         if (ap_scoreboard_image->parent[i].pid != 0) {
             continue;
         }
@@ -2497,34 +2636,22 @@ static void startup_children(int number_to_start)
 static void perform_idle_server_maintenance(int child_bucket, int num_buckets)
 {
     int i, j;
-    int idle_thread_count;
+    int idle_thread_count = 0;
     worker_score *ws;
     process_score *ps;
-    int free_length;
-    int totally_free_length = 0;
+    int free_length = 0;
     int free_slots[MAX_SPAWN_RATE];
-    int last_non_dead;
-    int total_non_dead;
+    int last_non_dead = -1;
     int active_thread_count = 0;
 
-    /* initialize the free_list */
-    free_length = 0;
-
-    idle_thread_count = 0;
-    last_non_dead = -1;
-    total_non_dead = 0;
-
-    for (i = 0; i < ap_daemons_limit; ++i) {
+    for (i = 0; i < server_limit; ++i) {
         /* Initialization to satisfy the compiler. It doesn't know
          * that threads_per_child is always > 0 */
         int status = SERVER_DEAD;
-        int any_dying_threads = 0;
-        int any_dead_threads = 0;
-        int all_dead_threads = 1;
         int child_threads_active = 0;
 
         if (i >= retained->max_daemons_limit &&
-            totally_free_length == retained->idle_spawn_rate[child_bucket]) {
+            free_length == retained->idle_spawn_rate[child_bucket]) {
             /* short cut if all active processes have been examined and
              * enough empty scoreboard slots have been found
              */
@@ -2532,25 +2659,17 @@ static void perform_idle_server_maintenance(int ch
             break;
         }
         ps = &ap_scoreboard_image->parent[i];
-        for (j = 0; j < threads_per_child; j++) {
-            ws = &ap_scoreboard_image->servers[i][j];
-            status = ws->status;
+        if (ps->pid != 0) {
+            for (j = 0; j < threads_per_child; j++) {
+                ws = &ap_scoreboard_image->servers[i][j];
+                status = ws->status;
 
-            /* XXX any_dying_threads is probably no longer needed    GLA */
-            any_dying_threads = any_dying_threads ||
-                (status == SERVER_GRACEFUL);
-            any_dead_threads = any_dead_threads || (status == SERVER_DEAD);
-            all_dead_threads = all_dead_threads &&
-                (status == SERVER_DEAD || status == SERVER_GRACEFUL);
-
-            /* We consider a starting server as idle because we started it
-             * at least a cycle ago, and if it still hasn't finished starting
-             * then we're just going to swamp things worse by forking more.
-             * So we hopefully won't need to fork more if we count it.
-             * This depends on the ordering of SERVER_READY and SERVER_STARTING.
-             */
-            if (ps->pid != 0) { /* XXX just set all_dead_threads in outer
-                                   for loop if no pid?  not much else matters */
+                /* We consider a starting server as idle because we started it
+                 * at least a cycle ago, and if it still hasn't finished starting
+                 * then we're just going to swamp things worse by forking more.
+                 * So we hopefully won't need to fork more if we count it.
+                 * This depends on the ordering of SERVER_READY and SERVER_STARTING.
+                 */
                 if (status <= SERVER_READY && !ps->quiescing && !ps->not_accepting
                     && ps->generation == retained->my_generation
                     && ps->bucket == child_bucket)
@@ -2561,39 +2680,13 @@ static void perform_idle_server_maintenance(int ch
                     ++child_threads_active;
                 }
             }
+            last_non_dead = i;
         }
         active_thread_count += child_threads_active;
-        if (any_dead_threads
-            && totally_free_length < retained->idle_spawn_rate[child_bucket]
-            && free_length < MAX_SPAWN_RATE / num_buckets
-            && (!ps->pid      /* no process in the slot */
-                  || ps->quiescing)) {  /* or at least one is going away */
-            if (all_dead_threads) {
-                /* great! we prefer these, because the new process can
-                 * start more threads sooner.  So prioritize this slot
-                 * by putting it ahead of any slots with active threads.
-                 *
-                 * first, make room by moving a slot that's potentially still
-                 * in use to the end of the array
-                 */
-                free_slots[free_length] = free_slots[totally_free_length];
-                free_slots[totally_free_length++] = i;
-            }
-            else {
-                /* slot is still in use - back of the bus
-                 */
-                free_slots[free_length] = i;
-            }
-            ++free_length;
-        }
-        else if (child_threads_active == threads_per_child) {
+        if (!ps->pid && free_length < retained->idle_spawn_rate[child_bucket])
+            free_slots[free_length++] = i;
+        else if (child_threads_active == threads_per_child)
             had_healthy_child = 1;
-        }
-        /* XXX if (!ps->quiescing)     is probably more reliable  GLA */
-        if (!any_dying_threads) {
-            last_non_dead = i;
-            ++total_non_dead;
-        }
     }
 
     if (retained->sick_child_detected) {
@@ -2621,32 +2714,56 @@ static void perform_idle_server_maintenance(int ch
 
     retained->max_daemons_limit = last_non_dead + 1;
 
-    if (idle_thread_count > max_spare_threads / num_buckets) {
-        /* Kill off one child */
-        ap_mpm_podx_signal(all_buckets[child_bucket].pod,
-                           AP_MPM_PODX_GRACEFUL);
-        retained->idle_spawn_rate[child_bucket] = 1;
+    if (idle_thread_count > max_spare_threads / num_buckets)
+    {
+        /*
+         * Child processes that we ask to shut down won't die immediately
+         * but may stay around for a long time when they finish their
+         * requests. If the server load changes many times, many such
+         * gracefully finishing processes may accumulate, filling up the
+         * scoreboard. To avoid running out of scoreboard entries, we
+         * don't shut down more processes when the total number of processes
+         * is high.
+         *
+         * XXX It would be nice if we could
+         * XXX - kill processes without keepalive connections first
+         * XXX - tell children to stop accepting new connections, and
+         * XXX   depending on server load, later be able to resurrect them
+         *       or kill them
+         */
+        if (retained->total_daemons <= active_daemons_limit &&
+            retained->total_daemons < server_limit) {
+            /* Kill off one child */
+            ap_mpm_podx_signal(all_buckets[child_bucket].pod,
+                               AP_MPM_PODX_GRACEFUL);
+            retained->idle_spawn_rate[child_bucket] = 1;
+            active_daemons--;
+        } else {
+            ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                         "Not shutting down child: total daemons %d / "
+                         "active limit %d / ServerLimit %d",
+                         retained->total_daemons, active_daemons_limit,
+                         server_limit);
+        }
     }
     else if (idle_thread_count < min_spare_threads / num_buckets) {
-        /* terminate the free list */
-        if (free_length == 0) { /* scoreboard is full, can't fork */
-
-            if (active_thread_count >= ap_daemons_limit * threads_per_child) {
-                if (!retained->maxclients_reported) {
-                    /* only report this condition once */
-                    ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(00484)
-                                 "server reached MaxRequestWorkers setting, "
-                                 "consider raising the MaxRequestWorkers "
-                                 "setting");
-                    retained->maxclients_reported = 1;
-                }
+        if (active_thread_count >= max_workers) {
+            if (!retained->maxclients_reported) {
+                /* only report this condition once */
+                ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(00484)
+                             "server reached MaxRequestWorkers setting, "
+                             "consider raising the MaxRequestWorkers "
+                             "setting");
+                retained->maxclients_reported = 1;
             }
-            else {
-                ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO(00485)
-                             "scoreboard is full, not at MaxRequestWorkers");
-            }
             retained->idle_spawn_rate[child_bucket] = 1;
         }
+        else if (free_length == 0) { /* scoreboard is full, can't fork */
+            ap_log_error(APLOG_MARK, APLOG_ERR, 0, ap_server_conf, APLOGNO()
+                         "scoreboard is full, not at MaxRequestWorkers."
+                         "Increase ServerLimit.");
+            retained->idle_spawn_rate[child_bucket] = 1;
+        }
         else {
             if (free_length > retained->idle_spawn_rate[child_bucket]) {
                 free_length = retained->idle_spawn_rate[child_bucket];
@@ -2657,10 +2774,17 @@ static void perform_idle_server_maintenance(int ch
                              "to increase StartServers, ThreadsPerChild "
                              "or Min/MaxSpareThreads), "
                              "spawning %d children, there are around %d idle "
-                             "threads, and %d total children", free_length,
-                             idle_thread_count, total_non_dead);
+                             "threads, %d active children, and %d children "
+                             "that are shutting down", free_length,
+                             idle_thread_count, active_daemons,
+                             retained->total_daemons);
             }
             for (i = 0; i < free_length; ++i) {
+                ap_log_error(APLOG_MARK, APLOG_TRACE5, 0, ap_server_conf,
+                             "Spawning new child: slot %d active / "
+                             "total daemons: %d/%d",
+                             free_slots[i], active_daemons,
+                             retained->total_daemons);
                 make_child(ap_server_conf, free_slots[i], child_bucket);
             }
             /* the next time around we want to spawn twice as many if this
@@ -2682,7 +2806,6 @@ static void perform_idle_server_maintenance(int ch
 
 static void server_main_loop(int remaining_children_to_start, int num_buckets)
 {
-    ap_generation_t old_gen;
     int child_slot;
     apr_exit_why_e exitwhy;
     int status, processed_status;
@@ -2732,13 +2855,15 @@ static void server_main_loop(int remaining_childre
 
                 event_note_child_killed(child_slot, 0, 0);
                 ps = &ap_scoreboard_image->parent[child_slot];
+                if (!ps->quiescing)
+                    active_daemons--;
                 ps->quiescing = 0;
+                retained->total_daemons--;
                 if (processed_status == APEXIT_CHILDSICK) {
                     /* resource shortage, minimize the fork rate */
                     retained->idle_spawn_rate[ps->bucket] = 1;
                 }
-                else if (remaining_children_to_start
-                         && child_slot < ap_daemons_limit) {
+                else if (remaining_children_to_start) {
                     /* we're still doing a 1-for-1 replacement of dead
                      * children with new children
                      */
@@ -2746,24 +2871,12 @@ static void server_main_loop(int remaining_childre
                     --remaining_children_to_start;
                 }
             }
-            else if (ap_unregister_extra_mpm_process(pid.pid, &old_gen) == 1) {
-
-                event_note_child_killed(-1, /* already out of the scoreboard */
-                                        pid.pid, old_gen);
-                if (processed_status == APEXIT_CHILDSICK
-                    && old_gen == retained->my_generation) {
-                    /* resource shortage, minimize the fork rate */
-                    for (i = 0; i < num_buckets; i++) {
-                        retained->idle_spawn_rate[i] = 1;
-                    }
-                }
 #if APR_HAS_OTHER_CHILD
-            }
             else if (apr_proc_other_child_alert(&pid, APR_OC_REASON_DEATH,
                                                 status) == 0) {
                 /* handled */
+            }
 #endif
-            }
             else if (retained->is_graceful) {
                 /* Great, we've probably just lost a slot in the
                  * scoreboard.  Somehow we don't know about this child.
@@ -2825,8 +2938,8 @@ static int event_run(apr_pool_t * _pconf, apr_pool
     /* Don't thrash since num_buckets depends on the
      * system and the number of online CPU cores...
      */
-    if (ap_daemons_limit < num_buckets)
-        ap_daemons_limit = num_buckets;
+    if (active_daemons_limit < num_buckets)
+        active_daemons_limit = num_buckets;
     if (ap_daemons_to_start < num_buckets)
         ap_daemons_to_start = num_buckets;
     /* We want to create as much children at a time as the number of buckets,
@@ -2850,8 +2963,8 @@ static int event_run(apr_pool_t * _pconf, apr_pool
      * supposed to start up without the 1 second penalty between each fork.
      */
     remaining_children_to_start = ap_daemons_to_start;
-    if (remaining_children_to_start > ap_daemons_limit) {
-        remaining_children_to_start = ap_daemons_limit;
+    if (remaining_children_to_start > active_daemons_limit) {
+        remaining_children_to_start = active_daemons_limit;
     }
     if (!retained->is_graceful) {
         startup_children(remaining_children_to_start);
@@ -2881,7 +2994,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
          * Kill child processes, tell them to call child_exit, etc...
          */
         for (i = 0; i < num_buckets; i++) {
-            ap_mpm_podx_killpg(all_buckets[i].pod, ap_daemons_limit,
+            ap_mpm_podx_killpg(all_buckets[i].pod, active_daemons_limit,
                                AP_MPM_PODX_RESTART);
         }
         ap_reclaim_child_processes(1, /* Start with SIGTERM */
@@ -2905,7 +3018,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
         /* Close our listeners, and then ask our children to do same */
         ap_close_listeners();
         for (i = 0; i < num_buckets; i++) {
-            ap_mpm_podx_killpg(all_buckets[i].pod, ap_daemons_limit,
+            ap_mpm_podx_killpg(all_buckets[i].pod, active_daemons_limit,
                                AP_MPM_PODX_GRACEFUL);
         }
         ap_relieve_child_processes(event_note_child_killed);
@@ -2933,7 +3046,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
             ap_relieve_child_processes(event_note_child_killed);
 
             active_children = 0;
-            for (index = 0; index < ap_daemons_limit; ++index) {
+            for (index = 0; index < retained->max_daemons_limit; ++index) {
                 if (ap_mpm_safe_kill(MPM_CHILD_PID(index), 0) == APR_SUCCESS) {
                     active_children = 1;
                     /* Having just one child is enough to stay around */
@@ -2948,7 +3061,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
          * really dead.
          */
         for (i = 0; i < num_buckets; i++) {
-            ap_mpm_podx_killpg(all_buckets[i].pod, ap_daemons_limit,
+            ap_mpm_podx_killpg(all_buckets[i].pod, active_daemons_limit,
                                AP_MPM_PODX_RESTART);
         }
         ap_reclaim_child_processes(1, event_note_child_killed);
@@ -2977,7 +3090,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
                      " received.  Doing graceful restart");
         /* wake up the children...time to die.  But we'll have more soon */
         for (i = 0; i < num_buckets; i++) {
-            ap_mpm_podx_killpg(all_buckets[i].pod, ap_daemons_limit,
+            ap_mpm_podx_killpg(all_buckets[i].pod, active_daemons_limit,
                                AP_MPM_PODX_GRACEFUL);
         }
 
@@ -2992,7 +3105,7 @@ static int event_run(apr_pool_t * _pconf, apr_pool
          * pthreads are stealing signals from us left and right.
          */
         for (i = 0; i < num_buckets; i++) {
-            ap_mpm_podx_killpg(all_buckets[i].pod, ap_daemons_limit,
+            ap_mpm_podx_killpg(all_buckets[i].pod, active_daemons_limit,
                                AP_MPM_PODX_RESTART);
         }
 
@@ -3002,6 +3115,8 @@ static int event_run(apr_pool_t * _pconf, apr_pool
                      "SIGHUP received.  Attempting to restart");
     }
 
+    active_daemons = 0;
+
     return OK;
 }
 
@@ -3215,9 +3330,9 @@ static int event_pre_config(apr_pool_t * pconf, ap
     max_spare_threads = DEFAULT_MAX_FREE_DAEMON * DEFAULT_THREADS_PER_CHILD;
     server_limit = DEFAULT_SERVER_LIMIT;
     thread_limit = DEFAULT_THREAD_LIMIT;
-    ap_daemons_limit = server_limit;
+    active_daemons_limit = server_limit;
     threads_per_child = DEFAULT_THREADS_PER_CHILD;
-    max_workers = ap_daemons_limit * threads_per_child;
+    max_workers = active_daemons_limit * threads_per_child;
     had_healthy_child = 0;
     ap_extended_status = 0;
 
@@ -3426,10 +3541,10 @@ static int event_check_config(apr_pool_t *p, apr_p
         max_workers = threads_per_child;
     }
 
-    ap_daemons_limit = max_workers / threads_per_child;
+    active_daemons_limit = max_workers / threads_per_child;
 
     if (max_workers % threads_per_child) {
-        int tmp_max_workers = ap_daemons_limit * threads_per_child;
+        int tmp_max_workers = active_daemons_limit * threads_per_child;
 
         if (startup) {
             ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00513)
@@ -3437,7 +3552,7 @@ static int event_check_config(apr_pool_t *p, apr_p
                          "multiple of ThreadsPerChild of %d, decreasing to nearest "
                          "multiple %d, for a maximum of %d servers.",
                          max_workers, threads_per_child, tmp_max_workers,
-                         ap_daemons_limit);
+                         active_daemons_limit);
         } else {
             ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00514)
                          "MaxRequestWorkers of %d is not an integer multiple "
@@ -3448,25 +3563,25 @@ static int event_check_config(apr_pool_t *p, apr_p
         max_workers = tmp_max_workers;
     }
 
-    if (ap_daemons_limit > server_limit) {
+    if (active_daemons_limit > server_limit) {
         if (startup) {
             ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00515)
                          "WARNING: MaxRequestWorkers of %d would require %d servers "
                          "and would exceed ServerLimit of %d, decreasing to %d. "
                          "To increase, please see the ServerLimit directive.",
-                         max_workers, ap_daemons_limit, server_limit,
+                         max_workers, active_daemons_limit, server_limit,
                          server_limit * threads_per_child);
         } else {
             ap_log_error(APLOG_MARK, APLOG_WARNING, 0, s, APLOGNO(00516)
                          "MaxRequestWorkers of %d would require %d servers and "
                          "exceed ServerLimit of %d, decreasing to %d",
-                         max_workers, ap_daemons_limit, server_limit,
+                         max_workers, active_daemons_limit, server_limit,
                          server_limit * threads_per_child);
         }
-        ap_daemons_limit = server_limit;
+        active_daemons_limit = server_limit;
     }
 
-    /* ap_daemons_to_start > ap_daemons_limit checked in ap_mpm_run() */
+    /* ap_daemons_to_start > active_daemons_limit checked in ap_mpm_run() */
     if (ap_daemons_to_start < 1) {
         if (startup) {
             ap_log_error(APLOG_MARK, APLOG_WARNING | APLOG_STARTUP, 0, NULL, APLOGNO(00517)
Index: server/mpm/event/fdqueue.c
===================================================================
--- server/mpm/event/fdqueue.c	(revision 1761356)
+++ server/mpm/event/fdqueue.c	(working copy)
@@ -280,6 +280,19 @@ void ap_pop_pool(apr_pool_t ** recycled_pool, fd_q
     }
 }
 
+void ap_free_idle_pools(fd_queue_info_t *queue_info)
+{
+    apr_pool_t *p;
+
+    queue_info->max_recycled_pools = 0;
+    do {
+        ap_pop_pool(&p, queue_info);
+        if (p != NULL)
+            apr_pool_destroy(p);
+    } while (p != NULL);
+}
+
+
 apr_status_t ap_queue_info_term(fd_queue_info_t * queue_info)
 {
     apr_status_t rv;
@@ -477,7 +490,7 @@ apr_status_t ap_queue_pop_something(fd_queue_t * q
     return rv;
 }
 
-apr_status_t ap_queue_interrupt_all(fd_queue_t * queue)
+static apr_status_t queue_interrupt(fd_queue_t * queue, int all)
 {
     apr_status_t rv;
 
@@ -484,10 +497,23 @@ apr_status_t ap_queue_pop_something(fd_queue_t * q
     if ((rv = apr_thread_mutex_lock(queue->one_big_mutex)) != APR_SUCCESS) {
         return rv;
     }
-    apr_thread_cond_broadcast(queue->not_empty);
+    if (all)
+        apr_thread_cond_broadcast(queue->not_empty);
+    else
+        apr_thread_cond_signal(queue->not_empty);
     return apr_thread_mutex_unlock(queue->one_big_mutex);
 }
 
+apr_status_t ap_queue_interrupt_all(fd_queue_t * queue)
+{
+    return queue_interrupt(queue, 1);
+}
+
+apr_status_t ap_queue_interrupt_one(fd_queue_t * queue)
+{
+    return queue_interrupt(queue, 0);
+}
+
 apr_status_t ap_queue_term(fd_queue_t * queue)
 {
     apr_status_t rv;
Index: server/mpm/event/fdqueue.h
===================================================================
--- server/mpm/event/fdqueue.h	(revision 1761356)
+++ server/mpm/event/fdqueue.h	(working copy)
@@ -52,6 +52,7 @@ apr_status_t ap_queue_info_wait_for_idler(fd_queue
                                           int *had_to_block);
 apr_status_t ap_queue_info_term(fd_queue_info_t * queue_info);
 apr_uint32_t ap_queue_info_get_idlers(fd_queue_info_t * queue_info);
+void ap_free_idle_pools(fd_queue_info_t *queue_info);
 
 struct fd_queue_elem_t
 {
@@ -98,6 +99,7 @@ apr_status_t ap_queue_pop_something(fd_queue_t * q
                                     event_conn_state_t ** ecs, apr_pool_t ** p,
                                     timer_event_t ** te);
 apr_status_t ap_queue_interrupt_all(fd_queue_t * queue);
+apr_status_t ap_queue_interrupt_one(fd_queue_t * queue);
 apr_status_t ap_queue_term(fd_queue_t * queue);
 
 #endif /* FDQUEUE_H */
Index: server/mpm_unix.c
===================================================================
--- server/mpm_unix.c	(revision 1761356)
+++ server/mpm_unix.c	(working copy)
@@ -63,7 +63,13 @@
 #undef APLOG_MODULE_INDEX
 #define APLOG_MODULE_INDEX AP_CORE_MODULE_INDEX
 
-typedef enum {DO_NOTHING, SEND_SIGTERM, SEND_SIGKILL, GIVEUP} action_t;
+typedef enum {
+    DO_NOTHING,
+    SEND_SIGTERM,
+    SEND_SIGTERM_NOLOG,
+    SEND_SIGKILL,
+    GIVEUP
+} action_t;
 
 typedef struct extra_process_t {
     struct extra_process_t *next;
@@ -142,6 +148,8 @@ static int reclaim_one_pid(pid_t pid, action_t act
                      " still did not exit, "
                      "sending a SIGTERM",
                      pid);
+        /* FALLTHROUGH */
+    case SEND_SIGTERM_NOLOG:
         kill(pid, SIGTERM);
         break;
 
@@ -193,6 +201,7 @@ AP_DECLARE(void) ap_reclaim_child_processes(int te
                           * children but take no action against
                           * stragglers
                           */
+        {SEND_SIGTERM_NOLOG, 0}, /* skipped if terminate == 0 */
         {SEND_SIGTERM, apr_time_from_sec(3)},
         {SEND_SIGTERM, apr_time_from_sec(5)},
         {SEND_SIGTERM, apr_time_from_sec(7)},
@@ -202,19 +211,21 @@ AP_DECLARE(void) ap_reclaim_child_processes(int te
     int cur_action;      /* index of action we decided to take this
                           * iteration
                           */
-    int next_action = 1; /* index of first real action */
+    int next_action = terminate ? 1 : 2; /* index of first real action */
 
     ap_mpm_query(AP_MPMQ_MAX_DAEMON_USED, &max_daemons);
 
     do {
-        apr_sleep(waittime);
-        /* don't let waittime get longer than 1 second; otherwise, we don't
-         * react quickly to the last child exiting, and taking action can
-         * be delayed
-         */
-        waittime = waittime * 4;
-        if (waittime > apr_time_from_sec(1)) {
-            waittime = apr_time_from_sec(1);
+        if (action_table[next_action].action_time > 0) {
+            apr_sleep(waittime);
+            /* don't let waittime get longer than 1 second; otherwise, we don't
+             * react quickly to the last child exiting, and taking action can
+             * be delayed
+             */
+            waittime = waittime * 4;
+            if (waittime > apr_time_from_sec(1)) {
+                waittime = apr_time_from_sec(1);
+            }
         }
 
         /* see what action to take, if any */
Index: server/scoreboard.c
===================================================================
--- server/scoreboard.c	(revision 1761356)
+++ server/scoreboard.c	(working copy)
@@ -399,7 +399,7 @@ AP_DECLARE(int) ap_find_child_by_pid(apr_proc_t *p
     int i;
     int max_daemons_limit = 0;
 
-    ap_mpm_query(AP_MPMQ_MAX_DAEMONS, &max_daemons_limit);
+    ap_mpm_query(AP_MPMQ_MAX_DAEMON_USED, &max_daemons_limit);
 
     for (i = 0; i < max_daemons_limit; ++i) {
         if (ap_scoreboard_image->parent[i].pid == pid->pid) {

Re: Frequent wake-ups for mpm_event

Reply via email to