Hi Willy, Those changes are easy enough to make, so I've attached the patch again with those changes. I had to make a few small adjustments to the commit message anyway (some things that changed as a result of reviewing this path). Let me know if there's anything else that I can help with.
Thank you, Anthony
From 30896b2f6368cd022bfb574d4d5f647483b79a59 Mon Sep 17 00:00:00 2001 From: Anthony Deschamps <anthony.j.descha...@gmail.com> Date: Fri, 16 Feb 2024 16:00:35 -0500 Subject: [PATCH] MEDIUM: lb-chash: Deterministic node hashes based on server address Motivation: When services are discovered through DNS resolution, the order in which DNS records get resolved and assigned to servers is arbitrary. Therefore, even though two HAProxy instances using chash balancing might agree that a particular request should go to server3, it is likely the case that they have assigned different IPs and ports to the server in that slot. This patch adds a server option, "hash-key <key>" which can be set to "id" (the existing behaviour, default), "addr", or "addr-port". By deriving the keys for the chash tree nodes from a server's address and port we ensure that independent HAProxy instances will agree on routing decisions. If an address is not known then the key is derived from the server's puid as it was previously. When adjusting a server's weight, we now check whether the server's hash has changed. If it has, we have to remove all its nodes first, since the node keys will also have to change. --- doc/configuration.txt | 25 +++++++++++ include/haproxy/server-t.h | 9 ++++ src/lb_chash.c | 91 +++++++++++++++++++++++++++++++++++--- src/server.c | 31 +++++++++++++ 4 files changed, 150 insertions(+), 6 deletions(-) diff --git a/doc/configuration.txt b/doc/configuration.txt index 1065e6098..0d5649d34 100644 --- a/doc/configuration.txt +++ b/doc/configuration.txt @@ -4813,6 +4813,8 @@ error-log-format X X X - force-persist - - X X filter - X X X fullconn X - X X +hash-balance-factor X - X X +hash-key X - X X hash-type X - X X http-after-response X (!) X X X http-check comment X - X X @@ -6606,6 +6608,29 @@ hash-balance-factor <factor> See also : "balance" and "hash-type". +hash-key <key> + Specify how "hash-type consistent" node keys are computed + + Arguments : + <key> <key> may be one of the following : + + id The node keys will be derived from the server's numeric + identifier as set from "id" or which defaults to its position + in the server list. + + addr The node keys will be derived from the server's address, when + available, or else fall back on "id". + + addr-port The node keys will be derived from the server's address and + port, when available, or else fall back on "id". + + The "addr" and "addr-port" options may be useful in scenarios where multiple + HAProxy processes are balancing traffic to the same set of servers. If the + server order of each process is different (because, for example, DNS records + were resolved in different orders) then this will allow each independent + HAProxy processes to agree on routing decisions. + + hash-type <method> <function> <modifier> Specify a method to use for mapping hashes to servers diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h index f077ff2f8..811eae712 100644 --- a/include/haproxy/server-t.h +++ b/include/haproxy/server-t.h @@ -223,6 +223,13 @@ struct pid_list { int exited; }; +/* srv methods of computing chash keys */ +enum srv_hash_key { + SRV_HASH_KEY_ID = 0, /* derived from server puid */ + SRV_HASH_KEY_ADDR, /* derived from server address */ + SRV_HASH_KEY_ADDR_PORT /* derived from server address and port */ +}; + /* A tree occurrence is a descriptor of a place in a tree, with a pointer back * to the server itself. */ @@ -366,6 +373,8 @@ struct server { struct tree_occ *lb_nodes; /* lb_nodes_tot * struct tree_occ */ unsigned lb_nodes_tot; /* number of allocated lb_nodes (C-HASH) */ unsigned lb_nodes_now; /* number of lb_nodes placed in the tree (C-HASH) */ + enum srv_hash_key hash_key; /* method to compute node hash (C-HASH) */ + unsigned lb_server_key; /* hash of the values indicated by "hash_key" (C-HASH) */ const struct netns_entry *netns; /* contains network namespace name or NULL. Network namespace comes from configuration */ struct xprt_ops *xprt; /* transport-layer operations */ diff --git a/src/lb_chash.c b/src/lb_chash.c index 1c8034d89..e572c574e 100644 --- a/src/lb_chash.c +++ b/src/lb_chash.c @@ -21,8 +21,9 @@ #include <haproxy/backend.h> #include <haproxy/errors.h> #include <haproxy/queue.h> -#include <haproxy/server-t.h> +#include <haproxy/server.h> #include <haproxy/tools.h> +#include <haproxy/xxhash.h> /* Return next tree node after <node> which must still be in the tree, or be * NULL. Lookup wraps around the end to the beginning. If the next node is the @@ -58,6 +59,77 @@ static inline void chash_dequeue_srv(struct server *s) } } +/* Compute a key that can be used to insert a node into the CHASH tree. Servers + * have a base key, which can be computed in several ways (see + * chash_compute_server_key) and this function uses that seed to generate hash + * keys for however many nodes need to be inserted into the tree. + */ +static inline u32 chash_compute_node_key(struct server *s, unsigned node_index) +{ + return full_hash(s->lb_server_key + node_index); +} + +/* Compute the base server key that will be used to compute node keys. Servers + * may be configured to determine their hashes either from their ID, address, or + * address+port; the latter options allow independent HAProxy instances to agree + * on routing decisions, regardless of their order in the server list (which may + * be arbitrary, since it could depend on factors such as the order of entries + * in a DNS SRV record). If an address is not known or if the server is + * configured with `hash-key id` (the default) then the key will be determined + * from the server's puid. + */ +static inline u32 chash_compute_server_key(struct server *s) +{ + enum srv_hash_key hash_key = s->hash_key; + struct server_inetaddr srv_addr; + u32 key; + + /* If hash-key is addr or addr-port then we need the address, but if we + * can't determine the address then we fall back on hashing the puid. + */ + switch (hash_key) { + case SRV_HASH_KEY_ADDR: + case SRV_HASH_KEY_ADDR_PORT: + server_get_inetaddr(s, &srv_addr); + if (srv_addr.family != AF_INET && srv_addr.family != AF_INET6) { + hash_key = SRV_HASH_KEY_ID; + } + break; + + default: + break; + } + + if (hash_key == SRV_HASH_KEY_ADDR_PORT) { + key = full_hash(srv_addr.port.svc); + } else { + key = 0; + } + + switch (hash_key) { + case SRV_HASH_KEY_ADDR_PORT: + case SRV_HASH_KEY_ADDR: + switch (srv_addr.family) { + case AF_INET: + key = full_hash(key + srv_addr.addr.v4.s_addr); + break; + case AF_INET6: + key = XXH32(srv_addr.addr.v6.s6_addr, 16, key); + break; + default: + break; + } + break; + + case SRV_HASH_KEY_ID: + default: + key = full_hash(s->puid); + break; + } + + return key; +} + /* Adjust the number of entries of a server in its tree. The server must appear * as many times as its weight indicates it. If it's there too often, we remove * the last occurrences. If it's not there enough, we add more occurrences. To @@ -67,6 +139,15 @@ static inline void chash_dequeue_srv(struct server *s) */ static inline void chash_queue_dequeue_srv(struct server *s) { + u32 server_key = chash_compute_server_key(s); + + /* If the server key changed then we must rehash all the nodes. */ + if (server_key != s->lb_server_key) { + chash_dequeue_srv(s); + s->lb_nodes_tot = 0; + s->lb_server_key = server_key; + } + while (s->lb_nodes_now > s->next_eweight) { if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway s->lb_nodes_now = s->lb_nodes_tot; @@ -95,7 +176,7 @@ static inline void chash_queue_dequeue_srv(struct server *s) (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes)); for (j = s->lb_nodes_tot; j < s->next_eweight; j++) { s->lb_nodes[j].server = s; - s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j); + s->lb_nodes[j].node.key = chash_compute_node_key(s, j); } s->lb_nodes_tot = s->next_eweight; } @@ -238,9 +319,6 @@ static void chash_update_server_weight(struct server *srv) int old_state, new_state; struct proxy *p = srv->proxy; - if (!srv_lb_status_changed(srv)) - return; - /* If changing the server's weight changes its state, we simply apply * the procedures we already have for status change. If the state * remains down, the server is not in any tree, so it's as easy as @@ -505,9 +583,10 @@ int chash_init_server_tree(struct proxy *p) ha_alert("failed to allocate lb_nodes for server %s.\n", srv->id); return -1; } + srv->lb_server_key = chash_compute_server_key(srv); for (node = 0; node < srv->lb_nodes_tot; node++) { srv->lb_nodes[node].server = srv; - srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE + node); + srv->lb_nodes[node].node.key = chash_compute_node_key(srv, node); } if (srv_currently_usable(srv)) diff --git a/src/server.c b/src/server.c index df24612f5..f5ffeb830 100644 --- a/src/server.c +++ b/src/server.c @@ -184,6 +184,9 @@ static void _srv_set_inetaddr_port(struct server *srv, else srv->flags &= ~SRV_F_MAPPORTS; + /* balancers (chash in particular) may use the addr in their routing decisions */ + srv->proxy->lbprm.update_server_eweight(srv); + if (srv->log_target && srv->log_target->type == LOG_TARGET_DGRAM) { /* server is used as a log target, manually update log target addr for DGRAM */ ipcpy(addr, srv->log_target->addr); @@ -948,6 +951,32 @@ static int srv_parse_ws(char **args, int *cur_arg, return 0; } +/* Parse the "hash-key" server keyword */ +static int srv_parse_hash_key(char **args, int *cur_arg, + struct proxy *curproxy, struct server *newsrv, char **err) +{ + if (!args[*cur_arg + 1]) { + memprintf(err, "'%s expects 'id', 'addr', or 'addr-port' value", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + if (strcmp(args[*cur_arg + 1], "id") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ID; + } + else if (strcmp(args[*cur_arg + 1], "addr") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ADDR; + } + else if (strcmp(args[*cur_arg + 1], "addr-port") == 0) { + newsrv->hash_key = SRV_HASH_KEY_ADDR_PORT; + } + else { + memprintf(err, "'%s' has to be 'id', 'addr', or 'addr-port'", args[*cur_arg]); + return ERR_ALERT | ERR_FATAL; + } + + return 0; +} + /* Parse the "init-addr" server keyword */ static int srv_parse_init_addr(char **args, int *cur_arg, struct proxy *curproxy, struct server *newsrv, char **err) @@ -2221,6 +2250,7 @@ static struct srv_kw_list srv_kws = { "ALL", { }, { { "enabled", srv_parse_enabled, 0, 1, 1 }, /* Start the server in 'enabled' state */ { "error-limit", srv_parse_error_limit, 1, 1, 1 }, /* Configure the consecutive count of check failures to consider a server on error */ { "ws", srv_parse_ws, 1, 1, 1 }, /* websocket protocol */ + { "hash-key", srv_parse_hash_key, 1, 1, 1 }, /* Configure how chash keys are computed */ { "id", srv_parse_id, 1, 0, 1 }, /* set id# of server */ { "init-addr", srv_parse_init_addr, 1, 1, 0 }, /* */ { "log-bufsize", srv_parse_log_bufsize, 1, 1, 0 }, /* Set the ring bufsize for log server (only for log backends) */ @@ -2664,6 +2694,7 @@ void srv_settings_cpy(struct server *srv, const struct server *src, int srv_tmpl srv->minconn = src->minconn; srv->maxconn = src->maxconn; srv->slowstart = src->slowstart; + srv->hash_key = src->hash_key; srv->observe = src->observe; srv->onerror = src->onerror; srv->onmarkeddown = src->onmarkeddown; -- 2.44.0