Hi,

The attached patchset is the first cut at an attempt to work around the
linux issues with SOREUSEPORT that makes haproxy refuse a few new connections
under heavy load.
This works by transferring the existing sockets to the new process via the
stats socket. A new command-line flag has been added, -x, that takes the
path to the unix socket as an argument, and if set, will attempt to retrieve
all the listening sockets;
Right now, any error, either while connecting to the socket, or retrieving
the file descriptors, is fatal, but maybe it'd be better to just fall back
to the previous behavior instead of opening any missing socket ? I'm still
undecided about that.

Any testing, comments, etc would be greatly appreciated.

Regards,

Olivier
>From f2a13d1ce2f182170f70fe3d5312a538788f5877 Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Wed, 5 Apr 2017 22:24:59 +0200
Subject: [PATCH 1/6] MINOR: cli: Add a command to send listening sockets.

Add a new command that will send all the listening sockets, via the
stats socket, and their properties.
This is a first step to workaround the linux problem when reloading
haproxy.
---
 include/types/connection.h |   8 ++
 src/cli.c                  | 179 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 187 insertions(+)

diff --git a/include/types/connection.h b/include/types/connection.h
index 5ce5e0c..9d1b51a 100644
--- a/include/types/connection.h
+++ b/include/types/connection.h
@@ -389,6 +389,14 @@ struct tlv_ssl {
 #define PP2_CLIENT_CERT_CONN     0x02
 #define PP2_CLIENT_CERT_SESS     0x04
 
+
+/*
+ * Linux seems to be able to send 253 fds per sendmsg(), not sure
+ * about the other OSes.
+ */
+/* Max number of file descriptors we send in one sendmsg() */
+#define MAX_SEND_FD 253
+
 #endif /* _TYPES_CONNECTION_H */
 
 /*
diff --git a/src/cli.c b/src/cli.c
index fa45db9..37fb9fe 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -24,6 +24,8 @@
 #include <sys/stat.h>
 #include <sys/types.h>
 
+#include <net/if.h>
+
 #include <common/cfgparse.h>
 #include <common/compat.h>
 #include <common/config.h>
@@ -1013,6 +1015,182 @@ static int bind_parse_level(char **args, int cur_arg, 
struct proxy *px, struct b
        return 0;
 }
 
+/* Send all the bound sockets, always returns 1 */
+static int _getsocks(char **args, struct appctx *appctx, void *private)
+{
+       char *cmsgbuf = NULL;
+       unsigned char *tmpbuf = NULL;
+       struct cmsghdr *cmsg;
+       struct stream_interface *si = appctx->owner;
+       struct connection *remote = objt_conn(si_opposite(si)->end);
+       struct msghdr msghdr;
+       struct iovec iov;
+       int *tmpfd;
+       int tot_fd_nb = 0;
+       struct proxy *px;
+       int i = 0;
+       int fd = remote->t.sock.fd;
+       int curoff = 0;
+       int old_fcntl;
+       int ret;
+
+       /* Temporary set the FD in blocking mode, that will make our life 
easier */
+       old_fcntl = fcntl(fd, F_GETFL);
+       if (old_fcntl < 0) {
+               Warning("Couldn't get the flags for the unix socket\n");
+               goto out;
+       }
+       cmsgbuf = malloc(CMSG_SPACE(sizeof(int) * MAX_SEND_FD));
+       if (!cmsgbuf) {
+               Warning("Failed to allocate memory to send sockets\n");
+               goto out;
+       }
+       if (fcntl(fd, F_SETFL, old_fcntl &~ O_NONBLOCK) == -1) {
+               Warning("Cannot make the unix socket blocking\n");
+               goto out;
+       }
+       iov.iov_base = &tot_fd_nb;
+       iov.iov_len = sizeof(tot_fd_nb);
+       if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
+               goto out;
+       memset(&msghdr, 0, sizeof(msghdr));
+       /*
+        * First, calculates the total number of FD, so that we can let
+        * the caller know how much he should expects.
+        */
+       px = proxy;
+       while (px) {
+               struct listener *l;
+
+               list_for_each_entry(l, &px->conf.listeners, by_fe) {
+                       /* Only transfer IPv4/IPv6 sockets */
+                       if (l->proto->sock_family == AF_INET ||
+                           l->proto->sock_family == AF_INET6)
+                               tot_fd_nb++;
+               }
+               px = px->next;
+       }
+       if (tot_fd_nb == 0)
+               goto out;
+
+       /* First send the total number of file descriptors, so that the
+        * receiving end knows what to expect.
+        */
+       msghdr.msg_iov = &iov;
+       msghdr.msg_iovlen = 1;
+       ret = sendmsg(fd, &msghdr, 0);
+       if (ret != sizeof(tot_fd_nb)) {
+               Warning("Failed to send the number of sockets to send\n");
+               goto out;
+       }
+
+       /* Now send the fds */
+       msghdr.msg_control = cmsgbuf;
+       msghdr.msg_controllen = CMSG_SPACE(sizeof(int) * MAX_SEND_FD);
+       cmsg = CMSG_FIRSTHDR(&msghdr);
+       cmsg->cmsg_len = CMSG_LEN(MAX_SEND_FD * sizeof(int));
+       cmsg->cmsg_level = SOL_SOCKET;
+       cmsg->cmsg_type = SCM_RIGHTS;
+       tmpfd = (int *)CMSG_DATA(cmsg);
+
+       px = proxy;
+       /* For each socket, e message is sent, containing the following :
+        *  Size of the namespace name (or 0 if none), as an unsigned char.
+        *  The namespace name, if any
+        *  Size of the interface name (or 0 if none), as an unsigned char
+        *  The interface name, if any
+        *  Listener options, as an int.
+        */
+       /* We will send sockets MAX_SEND_FD per MAX_SEND_FD, allocate a
+        * buffer big enough to store the socket informations.
+        */
+       tmpbuf = malloc(MAX_SEND_FD * (1 + NAME_MAX + 1 + IFNAMSIZ + 
sizeof(int)));
+       if (tmpbuf == NULL) {
+               Warning("Failed to allocate memory to transfer socket 
informations\n");
+               goto out;
+       }
+       iov.iov_base = tmpbuf;
+       while (px) {
+               struct listener *l;
+
+               list_for_each_entry(l, &px->conf.listeners, by_fe) {
+                       int ret;
+                       /* Only transfer IPv4/IPv6 sockets */
+                       if (l->state >= LI_LISTEN &&
+                           (l->proto->sock_family == AF_INET ||
+                           l->proto->sock_family == AF_INET6)) {
+                               memcpy(&tmpfd[i % MAX_SEND_FD], &l->fd, 
sizeof(l->fd));
+                               if (!l->netns)
+                                       tmpbuf[curoff++] = 0;
+#ifdef CONFIG_HAP_NS
+                               else {
+                                       char *name = l->netns->node.key;
+                                       unsigned char len = l->netns->name_len;
+                                       tmpbuf[curoff++] = len;
+                                       memcpy(tmpbuf + curoff, name, len);
+                                       curoff += len;
+                               }
+#endif
+                               if (l->interface) {
+                                       unsigned char len = 
strlen(l->interface);
+                                       tmpbuf[curoff++] = len;
+                                       memcpy(tmpbuf + curoff, l->interface, 
len);
+                               curoff += len;
+                               } else
+                                       tmpbuf[curoff++] = 0;
+                               memcpy(tmpbuf + curoff, &l->options,
+                                   sizeof(l->options));
+                               curoff += sizeof(l->options);
+
+
+                               i++;
+                       } else
+                               continue;
+                       if ((!(i % MAX_SEND_FD))) {
+                               iov.iov_len = curoff;
+                               if (sendmsg(fd, &msghdr, 0) != curoff) {
+                                       Warning("Failed to transfer sockets\n");
+                                       printf("errno %d\n", errno);
+                                       goto out;
+                               }
+                               /* Wait for an ack */
+                               do {
+                                       ret = recv(fd, &tot_fd_nb,
+                                           sizeof(tot_fd_nb), 0);
+                               } while (ret == -1 && errno == EINTR);
+                               if (ret <= 0) {
+                                       Warning("Unexpected error while 
transferring sockets\n");
+                                       goto out;
+                               }
+                               curoff = 0;
+                       }
+
+               }
+               px = px->next;
+       }
+       if (i % MAX_SEND_FD) {
+               iov.iov_len = curoff;
+               cmsg->cmsg_len = CMSG_LEN((i % MAX_SEND_FD) * sizeof(int));
+               msghdr.msg_controllen = CMSG_SPACE(sizeof(int) *  (i % 
MAX_SEND_FD));
+               if (sendmsg(fd, &msghdr, 0) != curoff) {
+                       Warning("Failed to transfer sockets\n");
+                       goto out;
+               }
+       }
+
+out:
+       if (old_fcntl >= 0 && fcntl(fd, F_SETFL, old_fcntl) == -1) {
+               Warning("Cannot make the unix socket non-blocking\n");
+               goto out;
+       }
+       appctx->st0 = CLI_ST_END;
+       free(cmsgbuf);
+       free(tmpbuf);
+       return 1;
+}
+
+
+
 static struct applet cli_applet = {
        .obj_type = OBJ_TYPE_APPLET,
        .name = "<CLI>", /* used for logging */
@@ -1027,6 +1205,7 @@ static struct cli_kw_list cli_kws = {{ },{
        { { "set", "timeout",  NULL }, "set timeout    : change a timeout 
setting", cli_parse_set_timeout, NULL, NULL },
        { { "show", "env",  NULL }, "show env [var] : dump environment 
variables known to the process", cli_parse_show_env, cli_io_handler_show_env, 
NULL },
        { { "show", "cli", "sockets",  NULL }, "show cli sockets : dump list of 
cli sockets", cli_parse_default, cli_io_handler_show_cli_sock, NULL },
+       { { "_getsocks", NULL }, NULL,  _getsocks, NULL },
        {{},}
 }};
 
-- 
2.9.3

>From 82c00bae30bfa026aa446cc9c099ee4507aaa941 Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Wed, 5 Apr 2017 22:33:04 +0200
Subject: [PATCH 2/6] MINOR: global: Add an option to get the old listening
 sockets.

Add the "-x" flag, that takes a path to a unix socket as an argument. If
used, haproxy will connect to the socket, and asks to get all the
listening sockets from the old process. Any failure is fatal.
This is needed to get seamless reloads on linux.
---
 include/proto/listener.h |   1 +
 include/types/listener.h |  10 ++
 src/haproxy.c            | 244 ++++++++++++++++++++++++++++++++++++++++++++++-
 src/listener.c           |   2 +
 4 files changed, 256 insertions(+), 1 deletion(-)

diff --git a/include/proto/listener.h b/include/proto/listener.h
index 0af4563..552d695 100644
--- a/include/proto/listener.h
+++ b/include/proto/listener.h
@@ -144,6 +144,7 @@ static inline struct bind_conf *bind_conf_alloc(struct 
proxy *fe, const char *fi
        return bind_conf;
 }
 
+extern struct xfer_sock_list *xfer_sock_list;
 #endif /* _PROTO_LISTENER_H */
 
 /*
diff --git a/include/types/listener.h b/include/types/listener.h
index 1f94b0b..227cc28 100644
--- a/include/types/listener.h
+++ b/include/types/listener.h
@@ -246,6 +246,16 @@ struct bind_kw_list {
 };
 
 
+struct xfer_sock_list {
+       int fd;
+       char *iface;
+       char *namespace;
+       int options; /* socket options LI_O_* */
+       struct xfer_sock_list *prev;
+       struct xfer_sock_list *next;
+       struct sockaddr_storage addr;
+};
+
 #endif /* _TYPES_LISTENER_H */
 
 /*
diff --git a/src/haproxy.c b/src/haproxy.c
index bb097c4..fabf156 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -39,6 +39,7 @@
 #include <netinet/tcp.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
+#include <net/if.h>
 #include <netdb.h>
 #include <fcntl.h>
 #include <errno.h>
@@ -109,6 +110,10 @@
 #include <proto/dns.h>
 #include <proto/vars.h>
 
+#ifndef CMSG_ALIGN
+#define CMSG_ALIGN(len) (((len) + sizeof (size_t) - 1) \
+                        & (size_t) ~(sizeof (size_t) - 1))
+#endif
 
 /* list of config files */
 static struct list cfg_cfgfiles = LIST_HEAD_INIT(cfg_cfgfiles);
@@ -169,6 +174,9 @@ int jobs = 0;   /* number of active jobs (conns, listeners, 
active tasks, ...) *
 static int *oldpids = NULL;
 static int oldpids_sig; /* use USR1 or TERM */
 
+/* Path to the unix socket we use to retrieve listener sockets from the old 
process */
+static const char *old_unixsocket;
+
 /* this is used to drain data, and as a temporary buffer for sprintf()... */
 struct chunk trash = { };
 
@@ -372,6 +380,7 @@ static void usage(char *name)
                "        -dr ignores server address resolution failures\n"
                "        -dV disables SSL verify on servers side\n"
                "        -sf/-st [pid ]* finishes/terminates old pids.\n"
+               "        -x <unix_socket> get listening sockets from a unix 
socket\n"
                "\n",
                name, DEFAULT_MAXCONN, cfg_maxpconn);
        exit(1);
@@ -561,6 +570,214 @@ next_dir_entry:
        free(err);
 }
 
+static int get_old_sockets(const char *unixsocket)
+{
+       char *cmsgbuf = NULL, *tmpbuf = NULL;
+       int *tmpfd = NULL;
+       struct sockaddr_un addr;
+       struct cmsghdr *cmsg;
+       struct msghdr msghdr;
+       struct iovec iov;
+       struct xfer_sock_list *xfer_sock = NULL;
+       int sock = -1;
+       int ret = -1;
+       int ret2 = -1;
+       int fd_nb;
+       int got_fd = 0;
+       int i = 0;
+       size_t maxoff = 0, curoff = 0;
+
+       memset(&msghdr, 0, sizeof(msghdr));
+       cmsgbuf = malloc(CMSG_SPACE(sizeof(int)) * MAX_SEND_FD);
+       if (!cmsgbuf) {
+               Warning("Failed to allocate memory to send sockets\n");
+               goto out;
+       }
+       sock = socket(PF_UNIX, SOCK_STREAM, 0);
+       if (sock < 0) {
+               Warning("Failed to connect to the old process socket '%s'\n",
+                   unixsocket);
+               goto out;
+       }
+       strncpy(addr.sun_path, unixsocket, sizeof(addr.sun_path));
+       addr.sun_path[sizeof(addr.sun_path) - 1] = 0;
+       addr.sun_family = PF_UNIX;
+       ret = connect(sock, (struct sockaddr *)&addr, sizeof(addr));
+       if (ret < 0) {
+               Warning("Failed to connect to the old process socket '%s'\n",
+                   unixsocket);
+               goto out;
+       }
+       iov.iov_base = &fd_nb;
+       iov.iov_len = sizeof(fd_nb);
+       msghdr.msg_iov = &iov;
+       msghdr.msg_iovlen = 1;
+       send(sock, "_getsocks\n", strlen("_getsocks\n"), 0);
+       /* First, get the number of file descriptors to be received */
+       if (recvmsg(sock, &msghdr, MSG_WAITALL) != sizeof(fd_nb)) {
+               Warning("Failed to get the number of sockets to be transferred 
!\n");
+               goto out;
+       }
+       if (fd_nb == 0) {
+               ret = 0;
+               goto out;
+       }
+       tmpbuf = malloc(fd_nb * (1 + NAME_MAX + 1 + IFNAMSIZ + sizeof(int)));
+       if (tmpbuf == NULL) {
+               Warning("Failed to allocate memory while receiving sockets\n");
+               goto out;
+       }
+       tmpfd = malloc(fd_nb * sizeof(int));
+       if (tmpfd == NULL) {
+               Warning("Failed to allocate memory while receiving sockets\n");
+               goto out;
+       }
+       msghdr.msg_control = cmsgbuf;
+       msghdr.msg_controllen = CMSG_SPACE(sizeof(int)) * MAX_SEND_FD;
+       iov.iov_len = MAX_SEND_FD * (1 + NAME_MAX + 1 + IFNAMSIZ + sizeof(int));
+       do {
+               int ret3;
+
+               iov.iov_base = tmpbuf + curoff;
+               ret = recvmsg(sock, &msghdr, 0);
+               if (ret == -1 && errno == EINTR)
+                       continue;
+               if (ret <= 0)
+                       break;
+               /* Send an ack to let the sender know we got the sockets
+                * and it can send some more
+                */
+               do {
+                       ret3 = send(sock, &got_fd, sizeof(got_fd), 0);
+               } while (ret3 == -1 && errno == EINTR);
+               for (cmsg = CMSG_FIRSTHDR(&msghdr); cmsg != NULL;
+                   cmsg = CMSG_NXTHDR(&msghdr, cmsg)) {
+                       if (cmsg->cmsg_level == SOL_SOCKET &&
+                           cmsg->cmsg_type == SCM_RIGHTS) {
+                               size_t totlen = cmsg->cmsg_len -
+                                   CMSG_ALIGN(sizeof(struct cmsghdr));
+                               if (totlen / sizeof(int) + got_fd > fd_nb) {
+                                       Warning("Got to many sockets !\n");
+                                       goto out;
+                               }
+                               /*
+                                * Be paranoid and use memcpy() to avoid any
+                                * potential alignement issue.
+                                */
+                               memcpy(&tmpfd[got_fd], CMSG_DATA(cmsg), totlen);
+                               got_fd += totlen / sizeof(int);
+                       }
+               }
+               curoff += ret;
+       } while (got_fd < fd_nb);
+
+       if (got_fd != fd_nb) {
+               Warning("We didn't get the expected number of sockets 
(expecting %d got %d)\n",
+                   fd_nb, got_fd);
+               goto out;
+       }
+       maxoff = curoff;
+       curoff = 0;
+       for (i = 0; i < got_fd; i++) {
+               int fd = tmpfd[i];
+               socklen_t socklen;
+               int len;
+
+               xfer_sock = calloc(1, sizeof(*xfer_sock));
+               if (!xfer_sock) {
+                       Warning("Failed to allocate memory in get_old_sockets() 
!\n");
+                       break;
+               }
+               xfer_sock->fd = -1;
+
+               socklen = sizeof(xfer_sock->addr);
+               if (getsockname(fd, (struct sockaddr *)&xfer_sock->addr, 
&socklen) != 0) {
+                       Warning("Failed to get socket address\n");
+                       free(xfer_sock);
+                       continue;
+               }
+               if (curoff >= maxoff) {
+                       Warning("Inconsistency while transferring sockets\n");
+                       goto out;
+               }
+               len = tmpbuf[curoff++];
+               if (len > 0) {
+                       /* We have a namespace */
+                       if (curoff + len > maxoff) {
+                               Warning("Inconsistency while transferring 
sockets\n");
+                               goto out;
+                       }
+                       xfer_sock->namespace = malloc(len + 1);
+                       if (!xfer_sock->namespace) {
+                               Warning("Failed to allocate memory while 
transferring sockets\n");
+                               goto out;
+                       }
+                       memcpy(xfer_sock->namespace, &tmpbuf[curoff], len);
+                       xfer_sock->namespace[len] = 0;
+                       curoff += len;
+               }
+               if (curoff >= maxoff) {
+                       Warning("Inconsistency while transferring sockets\n");
+                       goto out;
+               }
+               len = tmpbuf[curoff++];
+               if (len > 0) {
+                       /* We have an interface */
+                       if (curoff + len > maxoff) {
+                               Warning("Inconsistency while transferring 
sockets\n");
+                               goto out;
+                       }
+                       xfer_sock->iface = malloc(len + 1);
+                       if (!xfer_sock->iface) {
+                               Warning("Failed to allocate memory while 
transferring sockets\n");
+                               goto out;
+                       }
+                       memcpy(xfer_sock->iface, &tmpbuf[curoff], len);
+                       xfer_sock->namespace[len] = 0;
+                       curoff += len;
+               }
+               if (curoff + sizeof(int) > maxoff) {
+                       Warning("Inconsistency while transferring sockets\n");
+                       goto out;
+               }
+               memcpy(&xfer_sock->options, &tmpbuf[curoff],
+                   sizeof(xfer_sock->options));
+               curoff += sizeof(xfer_sock->options);
+
+               xfer_sock->fd = fd;
+               if (xfer_sock_list)
+                       xfer_sock_list->prev = xfer_sock;
+               xfer_sock->next = xfer_sock_list;
+               xfer_sock->prev = NULL;
+               xfer_sock_list = xfer_sock;
+               xfer_sock = NULL;
+       }
+
+       ret2 = 0;
+out:
+       /* If we failed midway make sure to close the remaining
+        * file descriptors
+        */
+       if (tmpfd != NULL && i < got_fd) {
+               for (; i < got_fd; i++) {
+                       close(tmpfd[i]);
+               }
+       }
+       free(tmpbuf);
+       free(tmpfd);
+       free(cmsgbuf);
+       if (sock != -1)
+               close(sock);
+       if (xfer_sock) {
+               free(xfer_sock->namespace);
+               free(xfer_sock->iface);
+               if (xfer_sock->fd != -1)
+                       close(xfer_sock->fd);
+               free(xfer_sock);
+       }
+       return (ret2);
+}
+
 /*
  * This function initializes all the necessary variables. It only returns
  * if everything is OK. If something fails, it exits.
@@ -713,6 +930,15 @@ static void init(int argc, char **argv)
                        }
                        else if (*flag == 'q')
                                arg_mode |= MODE_QUIET;
+                       else if (*flag == 'x') {
+                               if (argv[1][0] == '-') {
+                                       Alert("Unix socket path expected with 
the -x flag\n");
+                                       exit(1);
+                               }
+                               old_unixsocket = argv[1];
+                               argv++;
+                               argc--;
+                       }
                        else if (*flag == 's' && (flag[1] == 'f' || flag[1] == 
't')) {
                                /* list of pids to finish ('f') or terminate 
('t') */
 
@@ -720,7 +946,6 @@ static void init(int argc, char **argv)
                                        oldpids_sig = SIGUSR1; /* finish then 
exit */
                                else
                                        oldpids_sig = SIGTERM; /* terminate 
immediately */
-
                                while (argc > 1 && argv[1][0] != '-') {
                                        oldpids = realloc(oldpids, (nb_oldpids 
+ 1) * sizeof(int));
                                        if (!oldpids) {
@@ -1688,6 +1913,12 @@ int main(int argc, char **argv)
 #endif
        }
 
+       if (old_unixsocket) {
+               if (get_old_sockets(old_unixsocket) != 0) {
+                       Alert("Failed to get the sockets from the old 
process!\n");
+                       exit(1);
+               }
+       }
        /* We will loop at most 100 times with 10 ms delay each time.
         * That's at most 1 second. We only send a signal to old pids
         * if we cannot grab at least one port.
@@ -1748,6 +1979,17 @@ int main(int argc, char **argv)
        } else if (err & ERR_WARN) {
                Alert("[%s.main()] %s.\n", argv[0], errmsg);
        }
+       /* Ok, all listener should now be bound, close any leftover sockets
+        * the previous process gave us, we don't need them anymore
+        */
+       while (xfer_sock_list != NULL) {
+               struct xfer_sock_list *tmpxfer = xfer_sock_list->next;
+               close(xfer_sock_list->fd);
+               free(xfer_sock_list->iface);
+               free(xfer_sock_list->namespace);
+               free(xfer_sock_list);
+               xfer_sock_list = tmpxfer;
+       }
 
        /* prepare pause/play signals */
        signal_register_fct(SIGTTOU, sig_pause, SIGTTOU);
diff --git a/src/listener.c b/src/listener.c
index 5f34911..7a1df0e 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -42,6 +42,8 @@ static struct bind_kw_list bind_keywords = {
        .list = LIST_HEAD_INIT(bind_keywords.list)
 };
 
+struct xfer_sock_list *xfer_sock_list = NULL;
+
 /* This function adds the specified listener's file descriptor to the polling
  * lists if it is in the LI_LISTEN state. The listener enters LI_READY or
  * LI_FULL state depending on its number of connections. In deamon mode, we
-- 
2.9.3

>From 29c4b7a45b7f1a9178787bf64b46ae122aaea0dd Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Wed, 5 Apr 2017 22:39:56 +0200
Subject: [PATCH 3/6] MINOR: tcp: When binding socket, attempt to reuse one
 from the old proc.

Try to reuse any socket from the old process, provided by the "-x" flag,
before binding a new one, assuming it is compatible.
"Compatible" here means same address and port, same namspace if any,
same interface if any, and that the following flags are the same :
LI_O_FOREIGN, LI_O_V6ONLY and LI_O_V4V6.
Also change tcp_bind_listener() to always enable/disable socket options,
instead of just doing so if it is in the configuration file, as the option
may have been removed, ie TCP_FASTOPEN may have been set in the old process,
and removed from the new configuration, so we have to disable it.
---
 src/proto_tcp.c | 122 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 2 deletions(-)

diff --git a/src/proto_tcp.c b/src/proto_tcp.c
index 5e12b99..82994fe 100644
--- a/src/proto_tcp.c
+++ b/src/proto_tcp.c
@@ -731,6 +731,83 @@ int tcp_connect_probe(struct connection *conn)
        return 0;
 }
 
+/* XXX: Should probably be elsewhere */
+static int compare_sockaddr(struct sockaddr_storage *a, struct 
sockaddr_storage *b)
+{
+       if (a->ss_family != b->ss_family) {
+               return (-1);
+       }
+       switch (a->ss_family) {
+       case AF_INET:
+               {
+                       struct sockaddr_in *a4 = (void *)a, *b4 = (void *)b;
+                       if (a4->sin_port != b4->sin_port)
+                               return (-1);
+                       return (memcmp(&a4->sin_addr, &b4->sin_addr,
+                           sizeof(a4->sin_addr)));
+               }
+       case AF_INET6:
+               {
+                       struct sockaddr_in6 *a6 = (void *)a, *b6 = (void *)b;
+                       if (a6->sin6_port != b6->sin6_port)
+                               return (-1);
+                       return (memcmp(&a6->sin6_addr, &b6->sin6_addr,
+                           sizeof(a6->sin6_addr)));
+               }
+       default:
+               return (-1);
+       }
+
+}
+
+#define LI_MANDATORY_FLAGS     (LI_O_FOREIGN | LI_O_V6ONLY | LI_O_V4V6)
+/* When binding the listeners, check if a socket has been sent to us by the
+ * previous process that we could reuse, instead of creating a new one.
+ */
+static int tcp_find_compatible_fd(struct listener *l)
+{
+       struct xfer_sock_list *xfer_sock = xfer_sock_list;
+       int ret = -1;
+
+       while (xfer_sock) {
+               if (!compare_sockaddr(&xfer_sock->addr, &l->addr)) {
+                       if ((l->interface == NULL && xfer_sock->iface == NULL) 
||
+                           (l->interface != NULL && xfer_sock->iface != NULL &&
+                            !strcmp(l->interface, xfer_sock->iface))) {
+                               if ((l->options & LI_MANDATORY_FLAGS) ==
+                                   (xfer_sock->options & LI_MANDATORY_FLAGS)) {
+                                       if ((xfer_sock->namespace == NULL &&
+                                           l->netns == NULL)
+#ifdef CONFIG_HAP_NS
+                                           || (xfer_sock->namespace != NULL &&
+                                           l->netns != NULL &&
+                                           !strcmp(xfer_sock->namespace,
+                                           l->netns->node.key))
+#endif
+                                          ) {
+                                               break;
+                                       }
+
+                               }
+                       }
+               }
+               xfer_sock = xfer_sock->next;
+       }
+       if (xfer_sock != NULL) {
+               ret = xfer_sock->fd;
+               if (xfer_sock == xfer_sock_list)
+                       xfer_sock_list = xfer_sock->next;
+               if (xfer_sock->prev)
+                       xfer_sock->prev->next = xfer_sock->next;
+               if (xfer_sock->next)
+                       xfer_sock->next->prev = xfer_sock->next->prev;
+               free(xfer_sock->iface);
+               free(xfer_sock->namespace);
+               free(xfer_sock);
+       }
+       return ret;
+}
+#undef L1_MANDATORY_FLAGS
 
 /* This function tries to bind a TCPv4/v6 listener. It may return a warning or
  * an error message in <errmsg> if the message is at most <errlen> bytes long
@@ -762,6 +839,9 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
 
        err = ERR_NONE;
 
+       if (listener->fd == -1)
+               listener->fd = tcp_find_compatible_fd(listener);
+
        /* if the listener already has an fd assigned, then we were offered the
         * fd by an external process (most likely the parent), and we don't want
         * to create a new socket. However we still want to set a few flags on
@@ -800,6 +880,20 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
 
        if (listener->options & LI_O_NOLINGER)
                setsockopt(fd, SOL_SOCKET, SO_LINGER, &nolinger, sizeof(struct 
linger));
+       else {
+               struct linger tmplinger;
+               socklen_t len = sizeof(tmplinger);
+               if (getsockopt(fd, SOL_SOCKET, SO_LINGER, &tmplinger, &len) == 
0 &&
+                   (tmplinger.l_onoff == 0 || tmplinger.l_linger == 0)) {
+                       /* Attempt to activate SO_LINGER, not sure what a sane
+                        * value is, using the default BSD value of 120s.
+                        */
+                       tmplinger.l_onoff = 1;
+                       tmplinger.l_linger = 120;
+                       setsockopt(fd, SOL_SOCKET, SO_LINGER, &tmplinger,
+                           sizeof(tmplinger));
+               }
+       }
 
 #ifdef SO_REUSEPORT
        /* OpenBSD and Linux 3.9 support this. As it's present in old libc 
versions of
@@ -870,6 +964,9 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
                        err |= ERR_WARN;
                }
        }
+       /* XXX: No else, the way to get the default MSS will vary from system
+        * to system.
+        */
 #endif
 #if defined(TCP_USER_TIMEOUT)
        if (listener->tcp_ut) {
@@ -878,7 +975,9 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
                        msg = "cannot set TCP User Timeout";
                        err |= ERR_WARN;
                }
-       }
+       } else
+               setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &zero,
+                   sizeof(zero));
 #endif
 #if defined(TCP_DEFER_ACCEPT)
        if (listener->options & LI_O_DEF_ACCEPT) {
@@ -888,7 +987,9 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
                        msg = "cannot enable DEFER_ACCEPT";
                        err |= ERR_WARN;
                }
-       }
+       } else
+               setsockopt(fd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &zero,
+                   sizeof(zero));
 #endif
 #if defined(TCP_FASTOPEN)
        if (listener->options & LI_O_TCP_FO) {
@@ -898,6 +999,21 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
                        msg = "cannot enable TCP_FASTOPEN";
                        err |= ERR_WARN;
                }
+       } else {
+               socklen_t len;
+               int qlen;
+               len = sizeof(qlen);
+               /* Only disable fast open if it was enabled, we don't want
+                * the kernel to create a fast open queue if there's none.
+                */
+               if (getsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &qlen, &len) == 0 
&&
+                   qlen != 0) {
+                       if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN, &zero,
+                           sizeof(zero)) == -1) {
+                               msg = "cannot disable TCP_FASTOPEN";
+                               err |= ERR_WARN;
+                       }
+               }
        }
 #endif
 #if defined(IPV6_V6ONLY)
@@ -928,6 +1044,8 @@ int tcp_bind_listener(struct listener *listener, char 
*errmsg, int errlen)
 #if defined(TCP_QUICKACK)
        if (listener->options & LI_O_NOQUICKACK)
                setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &zero, sizeof(zero));
+       else
+               setsockopt(fd, IPPROTO_TCP, TCP_QUICKACK, &one, sizeof(one));
 #endif
 
        /* the socket is ready */
-- 
2.9.3

>From 3054b5017ec3fbe5c027a2e355e4f5bdfa1cc2fa Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Wed, 5 Apr 2017 22:50:59 +0200
Subject: [PATCH 4/6] MINOR: doc: document the -x flag

---
 doc/haproxy.1      | 8 +++++++-
 doc/management.txt | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/doc/haproxy.1 b/doc/haproxy.1
index 0a1b2f5..91f58a3 100644
--- a/doc/haproxy.1
+++ b/doc/haproxy.1
@@ -6,7 +6,7 @@ HAProxy \- fast and reliable http reverse proxy and load 
balancer
 
 .SH SYNOPSIS
 
-haproxy \-f <configuration\ file|dir> [\-L\ <name>] [\-n\ maxconn] [\-N\ 
maxconn] [\-C\ <dir>] [\-v|\-vv] [\-d] [\-D] [\-q] [\-V] [\-c] [\-p\ <pidfile>] 
[\-dk] [\-ds] [\-de] [\-dp] [\-db] [\-dM[<byte>]] [\-m\ <megs>] [{\-sf|\-st}\ 
pidlist...]
+haproxy \-f <configuration\ file|dir> [\-L\ <name>] [\-n\ maxconn] [\-N\ 
maxconn] [\-C\ <dir>] [\-v|\-vv] [\-d] [\-D] [\-q] [\-V] [\-c] [\-p\ <pidfile>] 
[\-dk] [\-ds] [\-de] [\-dp] [\-db] [\-dM[<byte>]] [\-m\ <megs>] [\-x 
<unix_socket>] [{\-sf|\-st}\ pidlist...]
 
 .SH DESCRIPTION
 
@@ -155,6 +155,12 @@ which receive this signal will terminate immediately, 
closing all active
 sessions. This option must be specified last, followed by any number of
 PIDs. Technically speaking, \fBSIGTTOU\fP and \fBSIGTERM\fP are sent.
 
+.TP
+\f8\-x <unix_socket>\fP
+Attempt to connect to the unix socket, and retrieve all the listening sockets
+from the old process. Those sockets will then be used if possible instead of
+binding new ones.
+
 .SH LOGGING
 Since HAProxy can run inside a chroot, it cannot reliably access /dev/log.
 For this reason, it uses the UDP protocol to send its logs to the server,
diff --git a/doc/management.txt b/doc/management.txt
index 1d34f84..3368277 100644
--- a/doc/management.txt
+++ b/doc/management.txt
@@ -277,6 +277,11 @@ list of options is :
   -vv : display the version, build options, libraries versions and usable
     pollers. This output is systematically requested when filing a bug report.
 
+  -x <unix_socket> : connect to the specified socket and try to retrieve any
+    listening sockets from the old process, and use them instead of trying to
+    bind new ones. This is useful to avoid missing any new connection when
+    reloading the configuration on Linux.
+
 A safe way to start HAProxy from an init file consists in forcing the daemon
 mode, storing existing pids to a pid file and using this pid file to notify
 older processes to finish before leaving :
-- 
2.9.3

>From 427c414e0b8bb2ba4463d549d63f74027d3f0e7e Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Thu, 6 Apr 2017 01:05:05 +0200
Subject: [PATCH 5/6] MINOR: proxy: Don't close FDs if not our proxy.

When running with multiple process, if some proxies are just assigned
to some processes, the other processes will just close the file descriptors
for the listening sockets. However, we may still have to provide those
sockets when reloading, so instead we just try hard to pretend those proxies
are dead, while keeping the sockets opened.
---
 include/proto/fd.h       |  5 +++++
 include/proto/listener.h |  5 +++++
 include/proto/proxy.h    |  1 +
 src/fd.c                 | 21 +++++++++++++++++--
 src/haproxy.c            | 11 +++++++++-
 src/listener.c           | 33 ++++++++++++++++++++++--------
 src/proxy.c              | 52 ++++++++++++++++++++++++++++++++++++++++++++++++
 7 files changed, 117 insertions(+), 11 deletions(-)

diff --git a/include/proto/fd.h b/include/proto/fd.h
index 87309bf..1efe323 100644
--- a/include/proto/fd.h
+++ b/include/proto/fd.h
@@ -41,6 +41,11 @@ extern int fd_nbupdt;               // number of updates in 
the list
  */
 void fd_delete(int fd);
 
+/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
+ * The file descriptor is kept open.
+ */
+void fd_remove(int fd);
+
 /* disable the specified poller */
 void disable_poller(const char *poller_name);
 
diff --git a/include/proto/listener.h b/include/proto/listener.h
index 552d695..079d976 100644
--- a/include/proto/listener.h
+++ b/include/proto/listener.h
@@ -88,6 +88,11 @@ void dequeue_all_listeners(struct list *list);
  */
 int unbind_listener(struct listener *listener);
 
+/* This function pretends the listener is dead, but keeps the FD opened, so
+ * that we can provide it, for conf reloading.
+ */
+int unbind_listener_no_close(struct listener *listener);
+
 /* This function closes all listening sockets bound to the protocol <proto>,
  * and the listeners end in LI_ASSIGNED state if they were higher. It does not
  * detach them from the protocol. It always returns ERR_NONE.
diff --git a/include/proto/proxy.h b/include/proto/proxy.h
index 72f1e1d..e5d20c4 100644
--- a/include/proto/proxy.h
+++ b/include/proto/proxy.h
@@ -42,6 +42,7 @@ void soft_stop(void);
 int pause_proxy(struct proxy *p);
 int resume_proxy(struct proxy *p);
 void stop_proxy(struct proxy *p);
+void zombify_proxy(struct proxy *p);
 void pause_proxies(void);
 void resume_proxies(void);
 int  stream_set_backend(struct stream *s, struct proxy *be);
diff --git a/src/fd.c b/src/fd.c
index aeee602..1a62f9a 100644
--- a/src/fd.c
+++ b/src/fd.c
@@ -175,7 +175,7 @@ int fd_nbupdt = 0;             // number of updates in the 
list
 /* Deletes an FD from the fdsets, and recomputes the maxfd limit.
  * The file descriptor is also closed.
  */
-void fd_delete(int fd)
+static void fd_dodelete(int fd, int do_close)
 {
        if (fdtab[fd].linger_risk) {
                /* this is generally set when connecting to servers */
@@ -190,7 +190,8 @@ void fd_delete(int fd)
 
        port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
        fdinfo[fd].port_range = NULL;
-       close(fd);
+       if (do_close)
+               close(fd);
        fdtab[fd].owner = NULL;
        fdtab[fd].new = 0;
 
@@ -198,6 +199,22 @@ void fd_delete(int fd)
                maxfd--;
 }
 
+/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
+ * The file descriptor is also closed.
+ */
+void fd_delete(int fd)
+{
+       fd_dodelete(fd, 1);
+}
+
+/* Deletes an FD from the fdsets, and recomputes the maxfd limit.
+ * The file descriptor is kept open.
+ */
+void fd_remove(int fd)
+{
+       fd_dodelete(fd, 0);
+}
+
 /* Scan and process the cached events. This should be called right after
  * the poller. The loop may cause new entries to be created, for example
  * if a listener causes an accept() to initiate a new incoming connection
diff --git a/src/haproxy.c b/src/haproxy.c
index fabf156..feb0ec3 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -1673,6 +1673,15 @@ void deinit(void)
                }/* end while(s) */
 
                list_for_each_entry_safe(l, l_next, &p->conf.listeners, by_fe) {
+                       /*
+                        * Zombie proxy, the listener just pretend to be up
+                        * because they still hold an opened fd.
+                        * Close it and give the listener its real state.
+                        */
+                       if (p->state == PR_STSTOPPED && l->state >= LI_LISTEN) {
+                               close(l->fd);
+                               l->state = LI_INIT;
+                       }
                        unbind_listener(l);
                        delete_listener(l);
                        LIST_DEL(&l->by_fe);
@@ -2154,7 +2163,7 @@ int main(int argc, char **argv)
                while (px != NULL) {
                        if (px->bind_proc && px->state != PR_STSTOPPED) {
                                if (!(px->bind_proc & (1UL << proc)))
-                                       stop_proxy(px);
+                                       zombify_proxy(px);
                        }
                        px = px->next;
                }
diff --git a/src/listener.c b/src/listener.c
index 7a1df0e..a38d05d 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -242,12 +242,7 @@ void dequeue_all_listeners(struct list *list)
        }
 }
 
-/* This function closes the listening socket for the specified listener,
- * provided that it's already in a listening state. The listener enters the
- * LI_ASSIGNED state. It always returns ERR_NONE. This function is intended
- * to be used as a generic function for standard protocols.
- */
-int unbind_listener(struct listener *listener)
+static int do_unbind_listener(struct listener *listener, int do_close)
 {
        if (listener->state == LI_READY)
                fd_stop_recv(listener->fd);
@@ -256,13 +251,35 @@ int unbind_listener(struct listener *listener)
                LIST_DEL(&listener->wait_queue);
 
        if (listener->state >= LI_PAUSED) {
-               fd_delete(listener->fd);
-               listener->fd = -1;
+               if (do_close) {
+                       fd_delete(listener->fd);
+                       listener->fd = -1;
+               }
+               else
+                       fd_remove(listener->fd);
                listener->state = LI_ASSIGNED;
        }
        return ERR_NONE;
 }
 
+/* This function closes the listening socket for the specified listener,
+ * provided that it's already in a listening state. The listener enters the
+ * LI_ASSIGNED state. It always returns ERR_NONE. This function is intended
+ * to be used as a generic function for standard protocols.
+ */
+int unbind_listener(struct listener *listener)
+{
+       return do_unbind_listener(listener, 1);
+}
+
+/* This function pretends the listener is dead, but keeps the FD opened, so
+ * that we can provide it, for conf reloading.
+ */
+int unbind_listener_no_close(struct listener *listener)
+{
+       return do_unbind_listener(listener, 0);
+}
+
 /* This function closes all listening sockets bound to the protocol <proto>,
  * and the listeners end in LI_ASSIGNED state if they were higher. It does not
  * detach them from the protocol. It always returns ERR_NONE.
diff --git a/src/proxy.c b/src/proxy.c
index d158fac..9e3f901 100644
--- a/src/proxy.c
+++ b/src/proxy.c
@@ -991,6 +991,19 @@ void soft_stop(void)
        p = proxy;
        tv_update_date(0,1); /* else, the old time before select will be used */
        while (p) {
+               /* Zombie proxy, let's close the file descriptors */
+               if (p->state == PR_STSTOPPED &&
+                   !LIST_ISEMPTY(&p->conf.listeners) &&
+                   LIST_ELEM(p->conf.listeners.n,
+                   struct listener *, by_fe)->state >= LI_LISTEN) {
+                       struct listener *l;
+                       list_for_each_entry(l, &p->conf.listeners, by_fe) {
+                               if (l->state >= LI_LISTEN)
+                                       close(l->fd);
+                               l->state = LI_INIT;
+                       }
+               }
+
                if (p->state != PR_STSTOPPED) {
                        Warning("Stopping %s %s in %d ms.\n", 
proxy_cap_str(p->cap), p->id, p->grace);
                        send_log(p, LOG_WARNING, "Stopping %s %s in %d ms.\n", 
proxy_cap_str(p->cap), p->id, p->grace);
@@ -1051,6 +1064,45 @@ int pause_proxy(struct proxy *p)
        return 1;
 }
 
+/* This function makes the proxy unusable, but keeps the listening sockets
+ * opened, so that if any process requests them, we are able to serve them.
+ * This should only be called early, before we started accepting requests.
+ */
+void zombify_proxy(struct proxy *p)
+{
+       struct listener *l;
+       struct listener *first_to_listen = NULL;
+
+       list_for_each_entry(l, &p->conf.listeners, by_fe) {
+               enum li_state oldstate = l->state;
+
+               unbind_listener_no_close(l);
+               if (l->state >= LI_ASSIGNED) {
+                       delete_listener(l);
+                       listeners--;
+                       jobs--;
+               }
+               if (!first_to_listen && l->state >= LI_LISTEN)
+                       first_to_listen = l;
+               /*
+                * Pretend we're still up and running so that the fd
+                * will be sent if asked.
+                */
+               l->state = oldstate;
+       }
+       /* Quick hack : at stop time, to know we have to close the sockets
+        * despite the proxy being marked as stopped, make the first listener
+        * of the listener list an active one, so that we don't have to
+        * parse the whole list to be sure.
+        */
+       if (first_to_listen && LIST_ELEM(p->conf.listeners.n,
+           struct listener *, by_fe)) {
+               LIST_DEL(&l->by_fe);
+               LIST_ADD(&p->conf.listeners, &l->by_fe);
+       }
+
+       p->state = PR_STSTOPPED;
+}
 
 /*
  * This function completely stops a proxy and releases its listeners. It has
-- 
2.9.3

>From 0c63df73200016c36404baf183c6c7826d8dbe33 Mon Sep 17 00:00:00 2001
From: Olivier Houchard <ohouch...@haproxy.com>
Date: Thu, 6 Apr 2017 14:45:14 +0200
Subject: [PATCH 6/6] MINOR: socket transfer: Set a timeout on the socket.

Make sure we're not stuck forever by setting a timeout on the socket.
---
 src/cli.c     | 2 ++
 src/haproxy.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/cli.c b/src/cli.c
index 37fb9fe..d5ff11f 100644
--- a/src/cli.c
+++ b/src/cli.c
@@ -1025,6 +1025,7 @@ static int _getsocks(char **args, struct appctx *appctx, 
void *private)
        struct connection *remote = objt_conn(si_opposite(si)->end);
        struct msghdr msghdr;
        struct iovec iov;
+       struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
        int *tmpfd;
        int tot_fd_nb = 0;
        struct proxy *px;
@@ -1049,6 +1050,7 @@ static int _getsocks(char **args, struct appctx *appctx, 
void *private)
                Warning("Cannot make the unix socket blocking\n");
                goto out;
        }
+       setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv));
        iov.iov_base = &tot_fd_nb;
        iov.iov_len = sizeof(tot_fd_nb);
        if (!cli_has_level(appctx, ACCESS_LVL_ADMIN))
diff --git a/src/haproxy.c b/src/haproxy.c
index feb0ec3..af86b78 100644
--- a/src/haproxy.c
+++ b/src/haproxy.c
@@ -579,6 +579,7 @@ static int get_old_sockets(const char *unixsocket)
        struct msghdr msghdr;
        struct iovec iov;
        struct xfer_sock_list *xfer_sock = NULL;
+       struct timeval tv = { .tv_sec = 1, .tv_usec = 0 };
        int sock = -1;
        int ret = -1;
        int ret2 = -1;
@@ -608,6 +609,7 @@ static int get_old_sockets(const char *unixsocket)
                    unixsocket);
                goto out;
        }
+       setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (void *)&tv, sizeof(tv));
        iov.iov_base = &fd_nb;
        iov.iov_len = sizeof(fd_nb);
        msghdr.msg_iov = &iov;
-- 
2.9.3

Reply via email to