On Sun, Sep 18, 2016 at 12:16 PM, Luca Toscano <[email protected]> wrote:
>
> 2016-09-15 11:41 GMT+02:00 Stefan Priebe - Profihost AG
> <[email protected]>:
>>
>> That sounds great. Is there a version available that applies to 2.4?
>
> I tried to study/port Yann's patch to 2.4.x, but it is not that
> straightforward since there are a lot of differences between the two
> branches; most of them, AFAICS, are related to the skiplist's handling
> and usage.
Patch against 2.4.x attached.
Regards,
Yann.
Index: server/mpm/event/event.c
===================================================================
--- server/mpm/event/event.c (revision 1761316)
+++ server/mpm/event/event.c (working copy)
@@ -100,6 +100,8 @@
#include <limits.h> /* for INT_MAX */
+#define VOLATILE_READ(T, x) (*(volatile T *)&(x)) /* reads x through a volatile T lvalue */
+
/* Limit on the total --- clients will be locked out if more servers than
* this are needed. It is intended solely to keep the server from crashing
* when things get out of hand.
@@ -174,6 +176,7 @@ static int dying = 0;
static int workers_may_exit = 0;
static int start_thread_may_exit = 0;
static int listener_may_exit = 0;
+static int listener_is_wakeable = 0; /* Pollset supports APR_POLLSET_WAKEABLE */
static int num_listensocks = 0;
static apr_int32_t conns_this_child; /* MaxConnectionsPerChild, only access
in listener thread */
@@ -239,21 +242,45 @@ static struct timeout_queue *write_completion_q,
*keepalive_q,
*linger_q,
*short_linger_q;
+static apr_time_t queues_next_expiry;
static apr_pollfd_t *listener_pollfd;
/*
+ * The pollset for sockets that are in any of the timeout queues. Currently
+ * we use the timeout_mutex to make sure that connections are added/removed
+ * atomically to/from both event_pollset and a timeout queue. Otherwise
+ * some confusion can happen under high load if timeout queues and pollset
+ * get out of sync.
+ * XXX: It should be possible to make the lock unnecessary in many or even all
+ * XXX: cases.
+ */
+static apr_pollset_t *event_pollset;
+
+/*
* Macros for accessing struct timeout_queue.
* For TO_QUEUE_APPEND and TO_QUEUE_REMOVE, timeout_mutex must be held.
*/
-#define TO_QUEUE_APPEND(q, el) \
- do { \
- APR_RING_INSERT_TAIL(&(q)->head, el, event_conn_state_t, \
- timeout_list); \
- ++*(q)->total; \
- ++(q)->count; \
- } while (0)
+static void TO_QUEUE_APPEND(struct timeout_queue *q, event_conn_state_t *el)
+{ /* Append el at the tail of q and bump its counters; timeout_mutex must be held. */
+ APR_RING_INSERT_TAIL(&q->head, el, event_conn_state_t, timeout_list);
+ ++*q->total; /* total is reached through a pointer stored in the queue */
+ ++q->count; /* this queue's own entry count */
+ /* Cheaply update the overall queues' next expiry according to the
+ * first entry of this queue (oldest), if necessary; zero counts as unset.
+ */
+ el = APR_RING_FIRST(&q->head); /* el is reused: now the queue's first entry */
+ if (!queues_next_expiry
+ || queues_next_expiry > el->queue_timestamp + q->timeout) {
+ queues_next_expiry = el->queue_timestamp + q->timeout;
+ /* Unblock the listener if it's waiting on a longer timeout. */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+ }
+}
+
#define TO_QUEUE_REMOVE(q, el) \
do { \
APR_RING_REMOVE(el, timeout_list); \
@@ -462,6 +489,11 @@ static void wakeup_listener(void)
return;
}
+ /* unblock the listener if it's poll()ing */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+
/* unblock the listener if it's waiting for a worker */
ap_queue_info_term(worker_queue_info);
@@ -656,7 +688,11 @@ static apr_status_t decrement_connection_count(voi
default:
break;
}
- apr_atomic_dec32(&connection_count);
+ /* Unblock the listener if it's waiting for connection_count = 0 */
+ if (!apr_atomic_dec32(&connection_count)
+ && listener_is_wakeable && listener_may_exit) {
+ apr_pollset_wakeup(event_pollset);
+ }
return APR_SUCCESS;
}
@@ -819,6 +855,7 @@ static void notify_resume(event_conn_state_t *cs,
static int start_lingering_close_common(event_conn_state_t *cs, int in_worker)
{
+ int done = 0;
apr_status_t rv;
struct timeout_queue *q;
apr_socket_t *csd = cs->pfd.desc.s;
@@ -830,7 +867,6 @@ static int start_lingering_close_common(event_conn
#else
apr_socket_timeout_set(csd, 0);
#endif
- cs->queue_timestamp = apr_time_now();
/*
* If some module requested a shortened waiting period, only wait for
* 2s (SECONDS_TO_LINGER). This is useful for mitigating certain
@@ -851,25 +887,25 @@ static int start_lingering_close_common(event_conn
else {
cs->c->sbh = NULL;
}
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(q, cs);
cs->pfd.reqevents = (
cs->pub.sense == CONN_SENSE_WANT_WRITE ? APR_POLLOUT :
APR_POLLIN) | APR_POLLHUP | APR_POLLERR;
cs->pub.sense = CONN_SENSE_DEFAULT;
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
rv = apr_pollset_add(event_pollset, &cs->pfd);
+ if (rv == APR_SUCCESS || APR_STATUS_IS_EEXIST(rv)) {
+ TO_QUEUE_APPEND(q, cs);
+ done = 1;
+ }
apr_thread_mutex_unlock(timeout_mutex);
- if (rv != APR_SUCCESS && !APR_STATUS_IS_EEXIST(rv)) {
+ if (!done) {
ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03092)
"start_lingering_close: apr_pollset_add failure");
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_REMOVE(q, cs);
- apr_thread_mutex_unlock(timeout_mutex);
apr_socket_close(cs->pfd.desc.s);
ap_push_pool(worker_queue_info, cs->p);
- return 0;
}
- return 1;
+ return done;
}
/*
@@ -1129,15 +1165,15 @@ read_request:
* Set a write timeout for this connection, and let the
* event thread poll for writeability.
*/
- cs->queue_timestamp = apr_time_now();
notify_suspend(cs);
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(cs->sc->wc_q, cs);
cs->pfd.reqevents = (
cs->pub.sense == CONN_SENSE_WANT_READ ? APR_POLLIN :
APR_POLLOUT) | APR_POLLHUP | APR_POLLERR;
cs->pub.sense = CONN_SENSE_DEFAULT;
- rc = apr_pollset_add(event_pollset, &cs->pfd);
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
+ apr_pollset_add(event_pollset, &cs->pfd);
+ TO_QUEUE_APPEND(cs->sc->wc_q, cs);
apr_thread_mutex_unlock(timeout_mutex);
return;
}
@@ -1166,14 +1202,13 @@ read_request:
* timeout today. With a normal client, the socket will be readable in
* a few milliseconds anyway.
*/
- cs->queue_timestamp = apr_time_now();
notify_suspend(cs);
- apr_thread_mutex_lock(timeout_mutex);
- TO_QUEUE_APPEND(cs->sc->ka_q, cs);
-
/* Add work to pollset. */
cs->pfd.reqevents = APR_POLLIN;
+ cs->queue_timestamp = apr_time_now();
+ apr_thread_mutex_lock(timeout_mutex);
rc = apr_pollset_add(event_pollset, &cs->pfd);
+ TO_QUEUE_APPEND(cs->sc->ka_q, cs);
apr_thread_mutex_unlock(timeout_mutex);
if (rc != APR_SUCCESS) {
@@ -1347,6 +1382,7 @@ static void get_worker(int *have_idle_worker_p, in
static APR_RING_HEAD(timer_free_ring_t, timer_event_t) timer_free_ring;
static apr_skiplist *timer_skiplist;
+static apr_time_t timers_next_expiry;
/* The following compare function is used by apr_skiplist_insert() to keep the
* elements (timers) sorted and provide O(log n) complexity (this is also true
@@ -1396,6 +1432,18 @@ static apr_status_t event_register_timed_callback(
/* Okay, add sorted by when.. */
apr_skiplist_insert(timer_skiplist, te);
+ /* Cheaply update the overall timers' next expiry according to
+ * this event, if necessary.
+ */
+ if (!timers_next_expiry
+ || timers_next_expiry > te->when) {
+ timers_next_expiry = te->when;
+ /* Unblock the listener if it's waiting on a longer timer. */
+ if (listener_is_wakeable) {
+ apr_pollset_wakeup(event_pollset);
+ }
+ }
+
apr_thread_mutex_unlock(g_timer_skiplist_mtx);
return APR_SUCCESS;
@@ -1464,20 +1512,32 @@ static void process_timeout_queue(struct timeout_q
count = 0;
cs = first = last = APR_RING_FIRST(&qp->head);
while (cs != APR_RING_SENTINEL(&qp->head, event_conn_state_t,
- timeout_list)
- /* Trash the entry if:
- * - no timeout_time was given (asked for all), or
- * - it expired (according to the queue timeout), or
- * - the system clock skewed in the past: no entry should be
- * registered above the given timeout_time (~now) + the queue
- * timeout, we won't keep any here (eg. for centuries).
- * Stop otherwise, no following entry will match thanks to the
- * single timeout per queue (entries are added to the end!).
- * This allows maintenance in O(1).
- */
- && (!timeout_time
- || cs->queue_timestamp + qp->timeout < timeout_time
- || cs->queue_timestamp > timeout_time + qp->timeout)) {
+ timeout_list)) {
+ /* Trash the entry if:
+ * - no timeout_time was given (asked for all), or
+ * - it expired (according to the queue timeout), or
+ * - the system clock skewed in the past: no entry should be
+ * registered above the given timeout_time (~now) + the queue
+ * timeout, we won't keep any here (eg. for centuries).
+ *
+ * Otherwise stop, no following entry will match thanks to the
+ * single timeout per queue (entries are added to the end!).
+ * This allows maintenance in O(1).
+ */
+ if (timeout_time
+ && cs->queue_timestamp + qp->timeout > timeout_time
+ && cs->queue_timestamp < timeout_time + qp->timeout) {
+ /* Since this is the next expiring entry of this queue, update the
+ * overall queues' next expiry if it's later than this one.
+ */
+ apr_time_t cs_expiry = cs->queue_timestamp + qp->timeout;
+ if (!queues_next_expiry
+ || queues_next_expiry > cs_expiry) {
+ queues_next_expiry = cs_expiry;
+ }
+ break;
+ }
+
last = cs;
rv = apr_pollset_remove(event_pollset, &cs->pfd);
if (rv != APR_SUCCESS && !APR_STATUS_IS_NOTFOUND(rv)) {
@@ -1514,11 +1574,11 @@ static void process_timeout_queue(struct timeout_q
static void * APR_THREAD_FUNC listener_thread(apr_thread_t * thd, void *dummy)
{
- timer_event_t *ep;
timer_event_t *te;
apr_status_t rc;
proc_info *ti = dummy;
int process_slot = ti->pid;
+ struct process_score *ps = ap_get_scoreboard_process(process_slot);
apr_pool_t *tpool = apr_thread_pool_get(thd);
void *csd = NULL;
apr_pool_t *ptrans; /* Pool for per-transaction stuff */
@@ -1527,9 +1587,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_
const apr_pollfd_t *out_pfd;
apr_int32_t num = 0;
apr_interval_time_t timeout_interval;
- apr_time_t timeout_time = 0, now, last_log;
listener_poll_type *pt;
int closed = 0, listeners_disabled = 0;
+ apr_time_t last_log;
last_log = apr_time_now();
free(ti);
@@ -1558,7 +1618,9 @@ static void * APR_THREAD_FUNC listener_thread(apr_
apr_signal(LISTENER_SIGNAL, dummy_signal_handler);
for (;;) {
+ apr_time_t now, timeout_time;
int workers_were_busy = 0;
+
if (listener_may_exit) {
close_listeners(process_slot, &closed);
if (terminate_mode == ST_UNGRACEFUL
@@ -1569,7 +1631,12 @@ static void * APR_THREAD_FUNC listener_thread(apr_
if (conns_this_child <= 0)
check_infinite_requests();
+ /* Update poll() timeout below according to the next expiring
+ * timer or queue entry, if any.
+ */
+ timeout_interval = -1;
now = apr_time_now();
+
if (APLOGtrace6(ap_server_conf)) {
/* trace log status every second */
if (now - last_log > apr_time_from_msec(1000)) {
@@ -1588,24 +1655,61 @@ static void * APR_THREAD_FUNC listener_thread(apr_
}
}
- apr_thread_mutex_lock(g_timer_skiplist_mtx);
- te = apr_skiplist_peek(timer_skiplist);
- if (te) {
- if (te->when > now) {
- timeout_interval = te->when - now;
+ /* Avoid locking if there's no expiring timer in the list,
+ * poll() will be woken up anyway if a new timer comes in.
+ */
+ timeout_time = VOLATILE_READ(apr_time_t, timers_next_expiry);
+ if (timeout_time && timeout_time < now + EVENT_FUDGE_FACTOR) {
+ /* Push expired timers to a worker, the first one remaining
+ * determines the maximum time to poll() below.
+ */
+ apr_thread_mutex_lock(g_timer_skiplist_mtx);
+ while ((te = apr_skiplist_peek(timer_skiplist))) {
+ if (te->when < now + EVENT_FUDGE_FACTOR) {
+ apr_skiplist_pop(timer_skiplist, NULL);
+ push_timer2worker(te);
+ }
+ else {
+ timeout_interval = te->when - now;
+ timers_next_expiry = te->when;
+ break;
+ }
}
- else {
- timeout_interval = 1;
+ /* If there are no timers in the list, either the listener is
+ * wakeable and it can poll() indefinitely until a wake up occurs,
+ * or periodic checks must be performed.
+ */
+ if (!te) {
+ if (!listener_is_wakeable) {
+ timeout_interval = apr_time_from_msec(100);
+ }
+ timers_next_expiry = 0;
}
+ apr_thread_mutex_unlock(g_timer_skiplist_mtx);
}
- else {
+
+ /* Same for queues, if the listener is wakeable use the current expiry
+ * time and expect to be woken up for an earlier one, otherwise use the
+ * maintenance timeout (max).
+ */
+ timeout_time = VOLATILE_READ(apr_time_t, queues_next_expiry);
+ if (timeout_time
+ && (timeout_interval < 0
+ || timeout_time <= now
+ || timeout_interval > timeout_time - now)) {
+ timeout_interval = timeout_time > now ? timeout_time - now : 1;
+ }
+ if (!listener_is_wakeable
+ && timeout_interval > apr_time_from_msec(100)) {
timeout_interval = apr_time_from_msec(100);
}
- apr_thread_mutex_unlock(g_timer_skiplist_mtx);
rc = apr_pollset_poll(event_pollset, timeout_interval, &num, &out_pfd);
if (rc != APR_SUCCESS) {
if (APR_STATUS_IS_EINTR(rc)) {
+ /* Woken up, either update timeouts or shutdown,
+ * both logics are above.
+ */
continue;
}
if (!APR_STATUS_IS_TIMEUP(rc)) {
@@ -1614,6 +1718,7 @@ static void * APR_THREAD_FUNC listener_thread(apr_
"shutdown process gracefully");
signal_threads(ST_GRACEFUL);
}
+ num = 0;
}
if (listener_may_exit) {
@@ -1623,21 +1728,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_
break;
}
- now = apr_time_now();
- apr_thread_mutex_lock(g_timer_skiplist_mtx);
- ep = apr_skiplist_peek(timer_skiplist);
- while (ep) {
- if (ep->when < now + EVENT_FUDGE_FACTOR) {
- apr_skiplist_pop(timer_skiplist, NULL);
- push_timer2worker(ep);
- }
- else {
- break;
- }
- ep = apr_skiplist_peek(timer_skiplist);
- }
- apr_thread_mutex_unlock(g_timer_skiplist_mtx);
-
while (num) {
pt = (listener_poll_type *) out_pfd->client_data;
if (pt->type == PT_CSD) {
@@ -1802,34 +1892,25 @@ static void * APR_THREAD_FUNC listener_thread(apr_
/* XXX possible optimization: stash the current time for use as
* r->request_time for new requests
*/
- now = apr_time_now();
- /* We only do this once per 0.1s (TIMEOUT_FUDGE_FACTOR), or on a clock
- * skew (if the system time is set back in the meantime, timeout_time
- * will exceed now + TIMEOUT_FUDGE_FACTOR, can't happen otherwise).
+ /* We process the timeout queues here only when their overall next
+ * expiry (read once above) is over. This happens accurately since
+ * adding to the queues (in workers) can only decrease this expiry,
+ * while latest ones are only taken into account here (in listener)
+ * during queues' processing, with the lock held. This works both
+ * with and without wake-ability.
*/
- if (now > timeout_time || now + TIMEOUT_FUDGE_FACTOR < timeout_time ) {
- struct process_score *ps;
+ if (timeout_time && timeout_time < (now = apr_time_now())) {
timeout_time = now + TIMEOUT_FUDGE_FACTOR;
/* handle timed out sockets */
apr_thread_mutex_lock(timeout_mutex);
+ /* Processing all the queues below will recompute this. */
+ queues_next_expiry = 0;
+
/* Step 1: keepalive timeouts */
- /* If all workers are busy, we kill older keep-alive connections so that they
- * may connect to another process.
- */
- if (workers_were_busy && *keepalive_q->total) {
- ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
- "All workers are busy, will close %d keep-alive "
- "connections",
- *keepalive_q->total);
- process_timeout_queue(keepalive_q, 0,
- start_lingering_close_nonblocking);
- }
- else {
- process_timeout_queue(keepalive_q, timeout_time,
- start_lingering_close_nonblocking);
- }
+ process_timeout_queue(keepalive_q, timeout_time,
+ start_lingering_close_nonblocking);
/* Step 2: write completion timeouts */
process_timeout_queue(write_completion_q, timeout_time,
start_lingering_close_nonblocking);
@@ -1838,7 +1919,6 @@ static void * APR_THREAD_FUNC listener_thread(apr_
/* Step 4: (short) lingering close completion timeouts */
process_timeout_queue(short_linger_q, timeout_time, stop_lingering_close);
- ps = ap_get_scoreboard_process(process_slot);
ps->write_completion = *write_completion_q->total;
ps->keep_alive = *keepalive_q->total;
apr_thread_mutex_unlock(timeout_mutex);
@@ -1847,6 +1927,20 @@ static void * APR_THREAD_FUNC listener_thread(apr_
ps->suspended = apr_atomic_read32(&suspended_count);
ps->lingering_close = apr_atomic_read32(&lingering_count);
}
+ else if (workers_were_busy && *keepalive_q->total) {
+ /* If all workers are busy, we kill older keep-alive connections so that they
+ * may connect to another process.
+ */
+ ap_log_error(APLOG_MARK, APLOG_TRACE1, 0, ap_server_conf,
+ "All workers are busy, will close %d keep-alive "
+ "connections", *keepalive_q->total);
+ apr_thread_mutex_lock(timeout_mutex);
+ process_timeout_queue(keepalive_q, 0,
+ start_lingering_close_nonblocking);
+ ps->keep_alive = 0;
+ apr_thread_mutex_unlock(timeout_mutex);
+ }
+
if (listeners_disabled && !workers_were_busy
&& (int)apr_atomic_read32(&connection_count)
- (int)apr_atomic_read32(&lingering_count)
@@ -2027,6 +2121,8 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
int prev_threads_created;
int max_recycled_pools = -1;
int good_methods[] = {APR_POLLSET_KQUEUE, APR_POLLSET_PORT, APR_POLLSET_EPOLL};
+ /* XXX don't we need more to handle K-A or lingering close? */
+ const apr_uint32_t pollset_size = threads_per_child * 2;
/* We must create the fd queues before we start up the listener
* and worker threads. */
@@ -2066,24 +2162,24 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
/* Create the main pollset */
for (i = 0; i < sizeof(good_methods) / sizeof(void*); i++) {
- rv = apr_pollset_create_ex(&event_pollset,
- threads_per_child*2, /* XXX don't we need more, to handle
- * connections in K-A or lingering
- * close?
- */
- pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY | APR_POLLSET_NODEFAULT,
- good_methods[i]);
+ apr_uint32_t flags = APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY |
+ APR_POLLSET_NODEFAULT | APR_POLLSET_WAKEABLE;
+ rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+ good_methods[i]);
if (rv == APR_SUCCESS) {
+ listener_is_wakeable = 1;
break;
}
+ flags &= ~APR_POLLSET_WAKEABLE;
+ rv = apr_pollset_create_ex(&event_pollset, pollset_size, pchild, flags,
+ good_methods[i]);
+ if (rv == APR_SUCCESS) {
+ break;
+ }
}
if (rv != APR_SUCCESS) {
- rv = apr_pollset_create(&event_pollset,
- threads_per_child*2, /* XXX don't we need more, to handle
- * connections in K-A or lingering
- * close?
- */
- pchild, APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
+ rv = apr_pollset_create(&event_pollset, pollset_size, pchild,
+ APR_POLLSET_THREADSAFE | APR_POLLSET_NOCOPY);
}
if (rv != APR_SUCCESS) {
ap_log_error(APLOG_MARK, APLOG_ERR, rv, ap_server_conf, APLOGNO(03103)
@@ -2092,7 +2188,9 @@ static void *APR_THREAD_FUNC start_threads(apr_thr
}
ap_log_error(APLOG_MARK, APLOG_DEBUG, 0, ap_server_conf, APLOGNO(02471)
- "start_threads: Using %s", apr_pollset_method_name(event_pollset));
+ "start_threads: Using %s (%swakeable)",
+ apr_pollset_method_name(event_pollset),
+ listener_is_wakeable ? "" : "not ");
worker_sockets = apr_pcalloc(pchild, threads_per_child
* sizeof(apr_socket_t *));