On Thu, Apr 14, 2016 at 08:55:47AM +0200, Lukas Tribus wrote:
Let me put it this way:
frontend haproxy_test
bind-process 1-8
bind :12345 process 1
bind :12345 process 2
bind :12345 process 3
bind :12345 process 4
Leads to 8 processes, and the master process binds the socket 4 times
(PID 16509):
(...)
lukas@ubuntuvm:~/haproxy-1.5$ sudo netstat -tlp | grep hap
tcp 0 0 *:12345 *:* LISTEN
16509/haproxy
tcp 0 0 *:12345 *:* LISTEN
16509/haproxy
tcp 0 0 *:12345 *:* LISTEN
16509/haproxy
tcp 0 0 *:12345 *:* LISTEN
16509/haproxy
lukas@ubuntuvm:~/haproxy-1.5$
OK, so it's netstat which gives a wrong report; I see the same here. I
verified in /proc/$PID/fd/ and I properly saw the FDs. Next, "ss -anp"
also shows the full process list:
LISTEN 0 128 *:12345
*:*
users:(("haproxy",25360,7),("haproxy",25359,7),("haproxy",25358,7),("haproxy",25357,7),("haproxy",25356,7),("haproxy",25355,7),("haproxy",25354,7),("haproxy",25353,7))
LISTEN 0 128 *:12345
*:*
users:(("haproxy",25360,6),("haproxy",25359,6),("haproxy",25358,6),("haproxy",25357,6),("haproxy",25356,6),("haproxy",25355,6),("haproxy",25354,6),("haproxy",25353,6))
LISTEN 0 128 *:12345
*:*
users:(("haproxy",25360,5),("haproxy",25359,5),("haproxy",25358,5),("haproxy",25357,5),("haproxy",25356,5),("haproxy",25355,5),("haproxy",25354,5),("haproxy",25353,5))
LISTEN 0 128 *:12345
*:*
users:(("haproxy",25360,4),("haproxy",25359,4),("haproxy",25358,4),("haproxy",25357,4),("haproxy",25356,4),("haproxy",25355,4),("haproxy",25354,4),("haproxy",25353,4))
A performance test also shows a fair distribution of the load :
25353 willy 20 0 21872 4216 1668 S 26 0.1 0:04.54 haproxy
25374 willy 20 0 7456 108 0 S 25 0.0 0:02.26
injectl464
25376 willy 20 0 7456 108 0 S 25 0.0 0:02.27
injectl464
25377 willy 20 0 7456 108 0 S 25 0.0 0:02.26
injectl464
25375 willy 20 0 7456 108 0 S 24 0.0 0:02.26
injectl464
25354 willy 20 0 21872 4168 1620 R 22 0.1 0:04.51 haproxy
25356 willy 20 0 21872 4216 1668 R 22 0.1 0:04.21 haproxy
25355 willy 20 0 21872 4168 1620 S 21 0.1 0:04.38 haproxy
However, as you can see, these sockets are still bound to all processes,
and that's not a good idea in multi-queue mode.
I have added a few debug lines in enable_listener() like this :
$ git diff
diff --git a/src/listener.c b/src/listener.c
index 5abeb80..59c51a1 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -49,6 +49,7 @@ static struct bind_kw_list bind_keywords = {
*/
void enable_listener(struct listener *listener)
{
+ fddebug("%d: enabling fd %d\n", getpid(), listener->fd);
if (listener->state == LI_LISTEN) {
if ((global.mode & (MODE_DAEMON | MODE_SYSTEMD)) &&
listener->bind_conf->bind_proc &&
@@ -57,6 +58,7 @@ void enable_listener(struct listener *listener)
* want any fd event to reach it.
*/
fd_stop_recv(listener->fd);
+ fddebug("%d: pausing fd %d\n", getpid(),
listener->fd);
listener->state = LI_PAUSED;
}
else if (listener->nbconn < listener->maxconn) {
And we're seeing this upon startup for processes 25746..25755:
Thus, as you can see, the FDs are properly enabled, and then paused for
the unavailable ones.
willy@wtap:haproxy$ grep 4294967295 log | grep 25746
25746 write(4294967295, "25746: enabling fd 4\n", 21 <unfinished ...>
25746 write(4294967295, "25746: enabling fd 5\n", 21 <unfinished ...>
25746 write(4294967295, "25746: pausing fd 5\n", 20) = -1 EBADF (Bad
file descriptor)
25746 write(4294967295, "25746: enabling fd 6\n", 21) = -1 EBADF (Bad
file descriptor)
25746 write(4294967295, "25746: pausing fd 6\n", 20) = -1 EBADF (Bad
file descriptor)
25746 write(4294967295, "25746: enabling fd 7\n", 21 <unfinished ...>
25746 write(4294967295, "25746: pausing fd 7\n", 20 <unfinished ...>
willy@wtap:haproxy$ grep 4294967295 log | grep 25747
25747 write(4294967295, "25747: enabling fd 4\n", 21 <unfinished ...>
25747 write(4294967295, "25747: pausing fd 4\n", 20 <unfinished ...>
25747 write(4294967295, "25747: enabling fd 5\n", 21 <unfinished ...>
25747 write(4294967295, "25747: enabling fd 6\n", 21 <unfinished ...>
25747 write(4294967295, "25747: pausing fd 6\n", 20 <unfinished ...>
25747 write(4294967295, "25747: enabling fd 7\n", 21 <unfinished ...>
25747 write(4294967295, "25747: pausing fd 7\n", 20 <unfinished ...>
willy@wtap:haproxy$ grep 4294967295 log | grep 25748
25748 write(4294967295, "25748: enabling fd 4\n", 21 <unfinished ...>
25748 write(4294967295, "25748: pausing fd 4\n", 20 <unfinished ...>
25748 write(4294967295, "25748: enabling fd 5\n", 21 <unfinished ...>
25748 write(4294967295, "25748: pausing fd 5\n", 20 <unfinished ...>
25748 write(4294967295, "25748: enabling fd 6\n", 21 <unfinished ...>
25748 write(4294967295, "25748: enabling fd 7\n", 21 <unfinished ...>
25748 write(4294967295, "25748: pausing fd 7\n", 20 <unfinished ...>
willy@wtap:haproxy$ grep 4294967295 log | grep 25749
25749 write(4294967295, "25749: enabling fd 4\n", 21 <unfinished ...>
25749 write(4294967295, "25749: pausing fd 4\n", 20 <unfinished ...>
25749 write(4294967295, "25749: enabling fd 5\n", 21 <unfinished ...>
25749 write(4294967295, "25749: pausing fd 5\n", 20 <unfinished ...>
25749 write(4294967295, "25749: enabling fd 6\n", 21 <unfinished ...>
25749 write(4294967295, "25749: pausing fd 6\n", 20 <unfinished ...>
25749 write(4294967295, "25749: enabling fd 7\n", 21 <unfinished ...>
willy@wtap:haproxy$ grep 4294967295 log | grep 25750
25750 write(4294967295, "25750: enabling fd 4\n", 21 <unfinished ...>
25750 write(4294967295, "25750: pausing fd 4\n", 20 <unfinished ...>
25750 write(4294967295, "25750: enabling fd 5\n", 21 <unfinished ...>
25750 write(4294967295, "25750: pausing fd 5\n", 20 <unfinished ...>
25750 write(4294967295, "25750: enabling fd 6\n", 21 <unfinished ...>
25750 write(4294967295, "25750: pausing fd 6\n", 20 <unfinished ...>
25750 write(4294967295, "25750: enabling fd 7\n", 21 <unfinished ...>
25750 write(4294967295, "25750: pausing fd 7\n", 20 <unfinished ...>
Now with the following patch to completely unbind such listeners :
diff --git a/src/listener.c b/src/listener.c
index 5abeb80..0296d50 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -56,8 +57,7 @@ void enable_listener(struct listener *listener)
/* we don't want to enable this listener and
don't
* want any fd event to reach it.
*/
- fd_stop_recv(listener->fd);
- listener->state = LI_PAUSED;
+ unbind_listener(listener);
}
else if (listener->nbconn < listener->maxconn) {
fd_want_recv(listener->fd);
I get this which is much cleaner :
LISTEN 0 128 *:12345
*:* users:(("haproxy",25949,7))
LISTEN 0 128 *:12345
*:* users:(("haproxy",25948,6))
LISTEN 0 128 *:12345
*:* users:(("haproxy",25947,5))
LISTEN 0 128 *:12345
*:* users:(("haproxy",25946,4))
So I guess that indeed, if not all of the processes a frontend is bound
to have a corresponding bind line, this can cause connection issues, as
some incoming connections will be distributed to queues that nobody
listens to.
I'm willing to commit this patch to make things cleaner and more
reliable. Here I'm getting the exact same performance with and without
it. Christian, you may want to apply it by hand to test whether it
improves the behaviour for you.