Hi Pieter,

On Thu, May 03, 2018 at 11:48:56PM +0200, PiBa-NL wrote:
> Hi Thierry,
> 
> Op 3-5-2018 om 8:59 schreef Thierry Fournier:
> > he bug. I even installed a FreeBSD:-)  I add Willy in
> > copy, maybe he will reproduce it.
> > 
> > Thierry
> 
> The 'trick' is probably sending as few requests as possible through a 'high
> latency' vpn (17ms for a ping from client to haproxy machine..).
> 
> Haproxy startup.
>     Line 17: 00000000:TestSite.clireq[0007:ffffffff]: GET /haproxy?stats HTTP/1.1
>     Line 34: 00000002:TestSite.clireq[0007:ffffffff]: GET /favicon.ico HTTP/1.1
>     Line 44: 00000001:TestSite.clireq[0008:ffffffff]: GET /webrequest/mailstat HTTP/1.1
>     Line 133: 00000003:TestSite.clireq[0008:ffffffff]: GET /webrequest/mailstat HTTP/1.1
>     Line 220: 00000004:TestSite.clireq[0008:ffffffff]: GET /favicon.ico HTTP/1.1
>     Line 233: 00000005:TestSite.clireq[0008:ffffffff]: GET /haproxy?stats HTTP/1.1
>     Line 251: 00000006:TestSite.clireq[0008:ffffffff]: GET /webrequest/mailstat HTTP/1.1
> Crash..
> 
> Sometimes it takes a few more requests, but it's not really consistent.. It's
> rather timing sensitive, I guess..
> 
> 
> But besides the reproduction, what is the theory behind the tasks and their
> cleanup, how 'should' it work?
> The Chrome browser makes a few requests to haproxy, some for the stats page
> and others for the lua service (and a favicon in between)..
> 
> At one point in time the tcp connection for the lua service gets closed and
> process_stream starts to call si_shutw.. a few calls deeper,
> hlua_applet_http_release removes the http task from the list..
> 
> static void hlua_applet_http_release(struct appctx *ctx)
> {
>     task_delete(ctx->ctx.hlua_apphttp.task);
>     task_free(ctx->ctx.hlua_apphttp.task);
> 
> Then, when the current task is 'done', it will move to the next one.. the
> rq_next in the process loop.. which however is pointing to the deleted/freed
> hlua_apphttp.task..?.. So getting the next task from that already destroyed
> element will fail...
> 
> Perhaps something like the patch below could work?
> Does it make sense? (The same should then be done for the tcp and cli tasks,
> I guess..)
> For my test case it doesn't crash anymore with that change. But I'm not sure
> if it's now leaking memory instead in some cases.. Is there an easy way to
> check?
> 
> Regards,
> PiBa-NL (Pieter)
> 

Thanks a lot for the detailed analysis. That seems spot on.
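
To restate the failure mode in a stripped-down form: the scheduler caches a
pointer to the next run-queue entry before running the current task, and the
Lua applet release path frees the task that cached pointer refers to. Below is
a toy illustration of that pattern, with a plain linked list standing in for
the eb32sc run-queue tree and invented handler names, so it is a sketch of the
problem rather than the real haproxy code:

#include <stdlib.h>

/* Toy model: a plain linked list stands in for the eb32sc run-queue tree,
 * and two queued tasks stand in for the stream task and the applet's
 * hlua_apphttp.task.  All names are invented for the example.
 */
struct task {
    struct task *next;
    struct task *(*process)(struct task *t);
};

static struct task *run_queue;

static void task_delete_and_free(struct task *t)
{
    struct task **p = &run_queue;

    while (*p && *p != t)
        p = &(*p)->next;
    if (*p)
        *p = t->next;
    free(t);                   /* freed immediately, as task_free() did */
}

/* Stands in for the stream task's handler: shutting the applet down ends
 * up in hlua_applet_http_release(), which deletes and frees ANOTHER
 * queued task.
 */
static struct task *stream_handler(struct task *t)
{
    task_delete_and_free(t->next);
    return t;
}

static struct task *applet_handler(struct task *t)
{
    return t;
}

int main(void)
{
    struct task *stream = calloc(1, sizeof(*stream));
    struct task *applet = calloc(1, sizeof(*applet));
    struct task *t, *rq_next;

    stream->process = stream_handler;
    applet->process = applet_handler;
    stream->next = applet;
    run_queue = stream;

    t = run_queue;
    while (t) {
        /* the scheduler remembers the next entry BEFORE running the
         * current task, just like rq_next in process_runnable_tasks()
         */
        rq_next = t->next;
        t->process(t);         /* may delete and free *rq_next ... */
        t = rq_next;           /* ... leaving a dangling pointer: the next
                                * iteration reads freed memory, which is
                                * where haproxy crashed in the tree walk
                                */
    }
    return 0;
}

Built with -fsanitize=address, the second loop iteration should be reported as
a heap-use-after-free right away.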
We decided to do something a bit different from your proposed fix.
Does the attached patch fix your problems?
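
Roughly, the idea looks like the sketch below; it reuses the toy list from the
example above (no eb32sc tree, no locking or threads, and simplified
signatures), so it illustrates the approach rather than the patch itself:

#include <stdlib.h>

enum { TASK_RUNNING = 1 };

struct task {
    struct task *next;                       /* toy run queue is a plain list */
    int state;
    struct task *(*process)(struct task *t); /* NULL = marked for deletion */
};

static struct task *run_queue; /* pretend run queue (eb32sc tree in haproxy) */
static struct task *curr_task; /* task whose handler is currently running */
static struct task *rq_next;   /* next queue entry the scheduler will pick */

/* Unlink a task from the run queue.  If rq_next points to the entry being
 * removed, advance it first so the walk never resumes from freed memory
 * (the task_unlink_rq() hunk).
 */
static void task_unlink_rq(struct task *t)
{
    struct task **p = &run_queue;

    if (t == rq_next)
        rq_next = rq_next->next;
    while (*p && *p != t)
        p = &(*p)->next;
    if (*p)
        *p = t->next;
}

/* Free immediately only when it is safe: we are freeing ourselves, or the
 * task is not part of a RUNNING batch.  Otherwise clear ->process and let
 * the scheduler free it when it reaches the entry (the task_free() hunk,
 * where the real free is __task_free()).
 */
static void task_free(struct task *t)
{
    if (t == curr_task || !(t->state & TASK_RUNNING))
        free(t);
    else
        t->process = NULL;
}

/* Handler that deletes another batched task as a side effect, the way
 * hlua_applet_http_release() deletes hlua_apphttp.task.
 */
static struct task *killer_handler(struct task *t)
{
    task_free(t->next);        /* only marks it: its ->process becomes NULL */
    return t;
}

static struct task *noop_handler(struct task *t)
{
    return t;
}

static void process_runnable_tasks(void)
{
    struct task *local_tasks[16];
    int n = 0, i;

    /* grab a batch of runnable tasks, as the threaded path does */
    rq_next = run_queue;
    while (rq_next && n < 16) {
        struct task *t = rq_next;
        rq_next = rq_next->next;
        task_unlink_rq(t);
        t->state |= TASK_RUNNING;
        local_tasks[n++] = t;
    }

    for (i = 0; i < n; i++) {
        struct task *t = local_tasks[i];
        curr_task = t;
        if (t->process)
            t->process(t);     /* run it; may task_free() another entry */
        else
            free(t);           /* an earlier handler deleted this one */
        curr_task = NULL;
    }
}

int main(void)
{
    struct task *a = calloc(1, sizeof(*a));
    struct task *b = calloc(1, sizeof(*b));

    a->process = killer_handler;
    b->process = noop_handler;
    a->next = b;
    run_queue = a;

    process_runnable_tasks();  /* b is freed by the scheduler, never run */
    free(a);                   /* a was run and is no longer queued */
    return 0;
}

The two defenses are independent: clearing ->process covers tasks that were
already pulled into the local batch, while adjusting rq_next in
task_unlink_rq() covers tasks still in the queue that the walk would visit
next.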

Thanks again!

Olivier
From 2ec1b9806cfc9b8425a85e402f08c46be3bd9130 Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Fri, 4 May 2018 15:46:16 +0200
Subject: [PATCH] BUG/MEDIUM: task: Don't free a task that is about to be run.

While running a task, we may try to delete and free another task that is
about to be run, either because it is part of the local tasks list or
because rq_next points to it.
So flag any task in the local tasks list that is to be deleted, instead of
running it, by setting t->process to NULL, and re-make rq_next a global,
thread-local variable, which gets updated if we attempt to delete the task
it points to.

Many thanks to PiBa-NL for reporting this and analysing the problem.

This should be backported to 1.8.
---
 include/proto/task.h | 21 +++++++++++++++++++--
 src/task.c           | 22 +++++++++++++++++-----
 2 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/include/proto/task.h b/include/proto/task.h
index cbc1a9072..c1c4c07ec 100644
--- a/include/proto/task.h
+++ b/include/proto/task.h
@@ -90,6 +90,8 @@ extern unsigned int nb_tasks_cur;
 extern unsigned int niced_tasks;  /* number of niced tasks in the run queue */
 extern struct pool_head *pool_head_task;
 extern struct pool_head *pool_head_notification;
+extern THREAD_LOCAL struct task *curr_task; /* task currently running or NULL */
+extern THREAD_LOCAL struct eb32sc_node *rq_next; /* Next task to be potentially run */
 
 __decl_hathreads(extern HA_SPINLOCK_T rq_lock);  /* spin lock related to run queue */
 __decl_hathreads(extern HA_SPINLOCK_T wq_lock);  /* spin lock related to wait queue */
@@ -177,8 +179,11 @@ static inline struct task *__task_unlink_rq(struct task *t)
 static inline struct task *task_unlink_rq(struct task *t)
 {
        HA_SPIN_LOCK(TASK_RQ_LOCK, &rq_lock);
-       if (likely(task_in_rq(t)))
+       if (likely(task_in_rq(t))) {
+               if (&t->rq == rq_next)
+                       rq_next = eb32sc_next(rq_next, tid_bit);
                __task_unlink_rq(t);
+       }
        HA_SPIN_UNLOCK(TASK_RQ_LOCK, &rq_lock);
        return t;
 }
@@ -230,7 +235,7 @@ static inline struct task *task_new(unsigned long thread_mask)
  * Free a task. Its context must have been freed since it will be lost.
  * The task count is decremented.
  */
-static inline void task_free(struct task *t)
+static inline void __task_free(struct task *t)
 {
        pool_free(pool_head_task, t);
        if (unlikely(stopping))
@@ -238,6 +243,18 @@ static inline void task_free(struct task *t)
        HA_ATOMIC_SUB(&nb_tasks, 1);
 }
 
+static inline void task_free(struct task *t)
+{
+       /* There's no need to protect t->state with a lock, as the task
+        * has to run on the current thread.
+        */
+       if (t == curr_task || !(t->state & TASK_RUNNING))
+               __task_free(t);
+       else
+               t->process = NULL;
+}
+
+
 /* Place <task> into the wait queue, where it may already be. If the expiration
  * timer is infinite, do nothing and rely on wake_expired_task to clean up.
  */
diff --git a/src/task.c b/src/task.c
index fd9acf66d..3d021bb4c 100644
--- a/src/task.c
+++ b/src/task.c
@@ -39,6 +39,7 @@ unsigned int nb_tasks_cur = 0;     /* copy of the tasks count */
 unsigned int niced_tasks = 0;      /* number of niced tasks in the run queue */
 
 THREAD_LOCAL struct task *curr_task = NULL; /* task currently running or NULL */
+THREAD_LOCAL struct eb32sc_node *rq_next = NULL; /* Next task to be potentially run */
 
 __decl_hathreads(HA_SPINLOCK_T __attribute__((aligned(64))) rq_lock); /* spin lock related to run queue */
 __decl_hathreads(HA_SPINLOCK_T __attribute__((aligned(64))) wq_lock); /* spin lock related to wait queue */
@@ -186,7 +187,6 @@ void process_runnable_tasks()
        struct task *t;
        int i;
        int max_processed;
-       struct eb32sc_node *rq_next;
        struct task *local_tasks[16];
        int local_tasks_count;
        int final_tasks_count;
@@ -227,8 +227,14 @@ void process_runnable_tasks()
                         */
                        if (likely(t->process == process_stream))
                                t = process_stream(t);
-                       else
-                               t = t->process(t);
+                       else {
+                               if (t->process != NULL)
+                                       t = t->process(t);
+                               else {
+                                       __task_free(t);
+                                       t = NULL;
+                               }
+                       }
                        curr_task = NULL;
 
                        if (likely(t != NULL)) {
@@ -309,8 +315,14 @@ void process_runnable_tasks()
                        curr_task = t;
                        if (likely(t->process == process_stream))
                                t = process_stream(t);
-                       else
-                               t = t->process(t);
+                       else {
+                               if (t->process != NULL)
+                                       t = t->process(t);
+                               else {
+                                       __task_free(t);
+                                       t = NULL;
+                               }
+                       }
                        curr_task = NULL;
                        if (t)
                                local_tasks[final_tasks_count++] = t;
-- 
2.14.3
