Hi,

I'm very sorry that my skills in gdb and knowledge of HAProxy and C are not
sufficient for this debugging process.

With the patch applied I tried again to use the spoa from
"contrib/spoa_example/". The example spoa agent does not understand my
spoe-message and silently ignores it, but that doesn't matter.

To trigger the segmentation fault I must reload HAProxy (when using the spoa
from an external vendor this additional reload wasn't necessary; I just had
to wait a couple of seconds to trigger the crash).

Usually HAProxy crashes in process_stream, but once it crashed at the
(long)h2s->subs & 1 check in testcorrupt during spoe_release_appctx:
#0  0x00005597450c25f9 in testcorrupt (ptr=0x7f4fb8071990) at
src/mux_h2.c:6238
        cs = 0x7f4fb8071990
        h2s = 0x7f4fe85751f0
#1  0x0000559745196239 in spoe_release_appctx (appctx=0x7f4fe8324e00) at
src/flt_spoe.c:1294
        si = 0x7f4fe82b31f8
        spoe_appctx = 0x7f4fe88dd760
        agent = 0x559746052580
        ctx = 0x7f4fe8380b80
        back = 0x559746355b38

Then I tried again to replicate the bug on my R&D server, this time
reloading HAProxy (multiple times) during the test, and it crashed.

HAProxy was compiled with git HEAD set to
77015abe0bcfde67bff519b1d48393a513015f77 with patch
0001-EXP-try-to-spot-where-h2s-subs-changes-V2.patch applied
and with modified h2s:

diff --git a/src/mux_h2.c b/src/mux_h2.c
index 9928b32c7..3d5187271 100644
--- a/src/mux_h2.c
+++ b/src/mux_h2.c
@@ -206,6 +206,8 @@ struct h2s {
  uint16_t status;     /* HTTP response status */
  unsigned long long body_len; /* remaining body length according to
content-length if H2_SF_DATA_CLEN */
  struct buffer rxbuf; /* receive buffer, always valid (buf_empty or real
buffer) */
+ struct tasklet *dummy0;
+ struct wait_event *dummy1;
  struct wait_event *subs;      /* recv wait_event the conn_stream
associated is waiting on (via h2_subscribe) */
  struct list list; /* To be used when adding in h2c->send_list or
h2c->fctl_lsit */
  struct tasklet *shut_tl;  /* deferred shutdown tasklet, to retry to send
an RST after we failed to,

Attached:
haproxy.cfg (/etc/haproxy/haproxy.cfg main config)
spoe-example.conf (/etc/haproxy/spoe-example.conf spoe config)

I used the spoa from contrib/spoa_example, run with the command:
"./spoa -p 4545  -c fragmentation -c async -c pipelining"

I used vegeta to generate traffic: https://github.com/tsenart/vegeta with
command:
"cat input | ./vegeta attack -duration=360s -insecure   -keepalive=false
 -http2=true -rate=500/1s > /dev/null"
I used 2 virtual machines to generate traffic, and additionally I
launched vegeta on the host running HAProxy.

where input file is:
GET https://haproxy-crash.test.local/
zdebek:
sdofijdsoifjodisjfoisdjfoisdovisoivjdfoijvoisdjvopsdijg0934u49032ut09gir09j40g9u0492it093i2g09i0r9bi2490ib094i0b9i09i0924it09bi2222itk42jh09tj4309sdfjdlsjfoadiwe9023i0r92094i4309gi0934ig9034ig093i4g90i3409gi3409gi0394ig0934i0g93jjoujgiurhjgiuerhgiurehgiuerhg89489u098u509u09wrut0923ej23fjjsufdsuf98dusf98u98u2398uf9834uf983u49f8h98huish9fsdu98fusd98uf982u398u3298ru2938uffhsdijhfisdjhiusdhfiu2iuhf2398289823189831893198931udashidsah

I then reloaded the HAProxy configuration (multiple times, again and again
until the segmentation fault occurred):
haproxy -f /etc/haproxy/haproxy.cfg -p /var/run/haproxy.pid -D -sf 10608

pon., 9 lis 2020 o 16:01 Maciej Zdeb <mac...@zdeb.pl> napisał(a):

> It now crashed on the first test in process_stream:
>
> struct task *process_stream(struct task *t, void *context, unsigned short
> state)
> {
>         struct server *srv;
>         struct stream *s = context;
>         struct session *sess = s->sess;
>         unsigned int rqf_last, rpf_last;
>         unsigned int rq_prod_last, rq_cons_last;
>         unsigned int rp_cons_last, rp_prod_last;
>         unsigned int req_ana_back;
>         struct channel *req, *res;
>         struct stream_interface *si_f, *si_b;
>         unsigned int rate;
>
>         TEST_STRM(s);
> [...]
>
> Program terminated with signal SIGSEGV, Segmentation fault.
> #0  0x000055f4cda7b5f9 in testcorrupt (ptr=0x7f75ac1ed990) at
> src/mux_h2.c:6238
> [Current thread is 1 (Thread 0x7f75a98b9700 (LWP 5860))]
> (gdb) bt full
> #0  0x000055f4cda7b5f9 in testcorrupt (ptr=0x7f75ac1ed990) at
> src/mux_h2.c:6238
>         cs = 0x7f75ac1ed990
>         h2s = 0x7f7584244510
> #1  0x000055f4cdad8993 in process_stream (t=0x7f75ac139d70,
> context=0x7f7588066540, state=260) at src/stream.c:1499
>         srv = 0x7f75a9896390
>         s = 0x7f7588066540
>         sess = 0x7f759c071b80
>         rqf_last = 4294967294
>         rpf_last = 2217468112
>         rq_prod_last = 32629
>         rq_cons_last = 2217603024
>         rp_cons_last = 32629
>         rp_prod_last = 2217182865
>         req_ana_back = 2217603025
>         req = 0x7f75a9896350
>         res = 0x55f4cdbed618 <__task_queue+92>
>         si_f = 0x55f4ce03c680 <task_per_thread+896>
>         si_b = 0x7f75842def80
>         rate = 2217603024
> #2  0x000055f4cdbeddb2 in run_tasks_from_list (list=0x55f4ce03c6c0
> <task_per_thread+960>, max=150) at src/task.c:371
>         process = 0x55f4cdad892d <process_stream>
>         t = 0x7f75ac139d70
>         state = 260
>         ctx = 0x7f7588066540
>         done = 3
> [...]
>
> subs is 0xffffffff like before, BUT dummy1 has also changed to 0xffff
>
> (gdb) p *(struct h2s*)(0x7f7584244510)
> $1 = {cs = 0x7f75ac1ed990, sess = 0x55f4ce02be40 <pool_cache+7328>, h2c =
> 0x7f758417abd0, h1m = {state = H1_MSG_RPBEFORE, flags = 12, curr_len = 0,
> body_len = 0, next = 0, err_pos = -1, err_state = 0}, by_id = {node = {
>       branches = {b = {0x7f758428e430, 0x7f7584244550}}, node_p =
> 0x7f758428e431, leaf_p = 0x7f7584244551, bit = 1, pfx = 33828}, key = 23},
> id = 23, flags = 16385, sws = 0, errcode = H2_ERR_NO_ERROR, st = H2_SS_HREM,
>   status = 0, body_len = 0, rxbuf = {size = 16384, area = 0x7f75780a2210
> "Ð?", data = 16384, head = 0}, dummy0 = 0x0, dummy1 = 0xffff, subs =
> 0xffffffff, list = {n = 0x7f75842445c8, p = 0x7f75842445c8},
>   shut_tl = 0x7f75842df0d0}
>
> pon., 9 lis 2020 o 15:07 Christopher Faulet <cfau...@haproxy.com>
> napisał(a):
>
>> Le 09/11/2020 à 13:10, Maciej Zdeb a écrit :
>> > I've played a little bit with the patch and it led me to the backend.c
>> > file and the connect_server() function
>> >
>> > int connect_server(struct stream *s)
>> > {
>> > [...]
>> > if (!conn_xprt_ready(srv_conn) && !srv_conn->mux) {
>> >                  /* set the correct protocol on the output stream
>> interface */
>> >                  if (srv)
>> >                          conn_prepare(srv_conn,
>> > protocol_by_family(srv_conn->dst->ss_family), srv->xprt);
>> >                  else if (obj_type(s->target) == OBJ_TYPE_PROXY) {
>> >                          /* proxies exclusively run on raw_sock right
>> now */
>> >                          conn_prepare(srv_conn,
>> > protocol_by_family(srv_conn->dst->ss_family), xprt_get(XPRT_RAW));
>> >                          if (!(srv_conn->ctrl)) {
>> >                                  conn_free(srv_conn);
>> >                                  return SF_ERR_INTERNAL;
>> >                          }
>> >                  }
>> >                  else {
>> >                          conn_free(srv_conn);
>> >                          return SF_ERR_INTERNAL;  /* how did we get
>> there ? */
>> >                  }
>> > // THIS ONE IS OK
>> > TEST_STRM(s);
>> > //////////////////////////////
>> >                  srv_cs = si_alloc_cs(&s->si[1], srv_conn);
>> > // FAIL
>> > TEST_STRM(s);
>> > //////////////////////////////
>> >                  if (!srv_cs) {
>> >                          conn_free(srv_conn);
>> >                          return SF_ERR_RESOURCE;
>> >                  }
>>
>> Hi,
>>
>> In fact, this crash occurs because of Willy's patch. It was not
>> designed to
>> handle non-h2 connections. Here the crash happens on a TCP connection,
>> used by a
>> SPOE applet for instance.
>>
>> I updated his patch. First, I added some calls to TEST_STRM() in the SPOE
>> code,
>> to be sure. I also explicitly set the stream task to NULL in
>> stream_free() to
>> catch late wakeups in the SPOE. Finally, I modified testcorrupt(). I hope
>> this
>> one is correct. But if I missed something, you may keep only the last
>> ABORT_NOW() in testcorrupt() and replace the others with a return statement,
>> just like
>> in Willy's patch.
>>
>> --
>> Christopher Faulet
>>
>

Attachment: spoe-example.conf
Description: Binary data

Attachment: haproxy.cfg
Description: Binary data

Reply via email to