> I'm not sure the scripts will help me (at least :-)). I was thinking that
> a test could just be "set server XXX addr YYY" on the CLI to change the
> server's address and verify that the hashing follows the address not the
> server number.

Oh, good point -- I wasn't testing what happens when changing the address
via the CLI. It turns out I wasn't handling that case. I added a call in
_srv_set_inetaddr_port() to update the hashes, and tested that it works.

I also moved the docs and added an entry to the table.

Here's the updated patch.

Thanks!
Anthony

>From 3412e86c222ad7b231d687f1402fbe30de312974 Mon Sep 17 00:00:00 2001
From: Anthony Deschamps <anthony.j.descha...@gmail.com>
Date: Fri, 16 Feb 2024 16:00:35 -0500
Subject: [PATCH] MEDIUM: lb-chash: Deterministic node hashes based on server
 address

Motivation: When services are discovered through DNS resolution, the order
in
which DNS records get resolved and assigned to servers is arbitrary.
Therefore,
even though two HAProxy instances using chash balancing might agree that a
particular request should go to server3, it is likely the case that they
have
assigned different IPs and ports to the server in that slot.

This patch adds a server option, "hash-key <key>" which can be set to "id"
(the
existing behaviour, defaut), "addr", or "addr-port". By deriving the keys
for
the chash tree nodes from a server's address and port we ensure that
independent
HAProxy instances will agree on routing decisions. If an address is not
known
then the key is derived from the server's puid as it was previously.

When adjusting a server's weight, we now unconditionally remove all nodes
first,
since the server's address (and therefore the desired node keys) may have
changed.
---
 doc/configuration.txt      |  24 +++++++++
 include/haproxy/server-t.h |   8 +++
 src/lb_chash.c             | 101 +++++++++++++++++++++++++++----------
 src/server.c               |  31 ++++++++++++
 4 files changed, 138 insertions(+), 26 deletions(-)

diff --git a/doc/configuration.txt b/doc/configuration.txt
index 1065e6098..052d41f9f 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -4813,6 +4813,8 @@ error-log-format                          X
 X         X         -
 force-persist                             -          -         X         X
 filter                                    -          X         X         X
 fullconn                                  X          -         X         X
+hash-balance-factor                       X          -         X         X
+hash-key                                  X          -         X         X
 hash-type                                 X          -         X         X
 http-after-response                       X (!)      X         X         X
 http-check comment                        X          -         X         X
@@ -6606,6 +6608,28 @@ hash-balance-factor <factor>
   See also : "balance" and "hash-type".


+hash-key <key>
+  Specify how "hash-type consistent" node keys are computed
+
+  Arguments :
+    <key>   <key> may be one of the following :
+
+      id         The node keys will be derived from the server's position
in
+                 the server list.
+
+      addr       The node keys will be derived from the server's address,
when
+                 available, or else fall back on "id".
+
+      addr-port  The node keys will be derived from the server's address
and
+                 port, when available, or else fall back on "id".
+
+  The "addr" and "addr-port" options may be useful in scenarios where
multiple
+  HAProxy processes are balancing traffic to the same set of servers. If
the
+  server order of each process is different (because, for example, DNS
records
+  were resolved in different orders) then this will allow each independent
+  HAProxy processes to agree on routing decisions.
+
+
 hash-type <method> <function> <modifier>
   Specify a method to use for mapping hashes to servers

diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h
index f077ff2f8..1a2735c2a 100644
--- a/include/haproxy/server-t.h
+++ b/include/haproxy/server-t.h
@@ -223,6 +223,13 @@ struct pid_list {
  int exited;
 };

+/* srv methods of computing chash keys */
+enum srv_hash_key {
+ SRV_HASH_KEY_ID = 0,         /* derived from server puid */
+ SRV_HASH_KEY_ADDR,           /* derived from server address */
+ SRV_HASH_KEY_ADDR_PORT       /* derived from server address and port */
+};
+
 /* A tree occurrence is a descriptor of a place in a tree, with a pointer
back
  * to the server itself.
  */
@@ -366,6 +373,7 @@ struct server {
  struct tree_occ *lb_nodes;              /* lb_nodes_tot * struct tree_occ
*/
  unsigned lb_nodes_tot;                  /* number of allocated lb_nodes
(C-HASH) */
  unsigned lb_nodes_now;                  /* number of lb_nodes placed in
the tree (C-HASH) */
+ enum srv_hash_key hash_key;             /* method to compute node hash
(C-HASH) */

  const struct netns_entry *netns;        /* contains network namespace
name or NULL. Network namespace comes from configuration */
  struct xprt_ops *xprt;                  /* transport-layer operations */
diff --git a/src/lb_chash.c b/src/lb_chash.c
index 1c8034d89..f3cce451f 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -21,8 +21,9 @@
 #include <haproxy/backend.h>
 #include <haproxy/errors.h>
 #include <haproxy/queue.h>
-#include <haproxy/server-t.h>
+#include <haproxy/server.h>
 #include <haproxy/tools.h>
+#include <haproxy/xxhash.h>

 /* Return next tree node after <node> which must still be in the tree, or
be
  * NULL. Lookup wraps around the end to the beginning. If the next node is
the
@@ -58,6 +59,64 @@ static inline void chash_dequeue_srv(struct server *s)
  }
 }

+/* Compute a key that can be used to insert a node into the CHASH tree.
Servers
+ * may be configured to determine their hashes from their address, so that
+ * independent HAProxy instances will agree on routing decisions. If an
address
+ * is not known or if the server is configured with `hash-key id` (the
default)
+ * then the key will be determined from the server's puid.
+ */
+static inline u32 chash_compute_node_key(struct server *s, unsigned
node_index)
+{
+ u32 key = 0;
+ struct server_inetaddr srv_addr;
+
+ enum srv_hash_key hash_key = s->hash_key;
+
+ /* If hash-key is addr or addr-port then we need the address, but if we
+ * can't determine the address then we fall back on hashing the puid.
+ */
+ switch (s->hash_key) {
+ case SRV_HASH_KEY_ADDR:
+ case SRV_HASH_KEY_ADDR_PORT:
+ server_get_inetaddr(s, &srv_addr);
+ if (srv_addr.family != AF_INET && srv_addr.family != AF_INET6) {
+  hash_key = SRV_HASH_KEY_ID;
+ }
+ break;
+
+ default:
+ break;
+ }
+
+ if (hash_key == SRV_HASH_KEY_ADDR_PORT) {
+ key = full_hash(srv_addr.port.svc);
+ }
+
+ switch (hash_key) {
+ case SRV_HASH_KEY_ADDR_PORT:
+ case SRV_HASH_KEY_ADDR:
+ switch (srv_addr.family) {
+ case AF_INET:
+ key ^= full_hash(srv_addr.addr.v4.s_addr);
+ key ^= full_hash(node_index);
+ break;
+ case AF_INET6:
+ key ^= XXH32(srv_addr.addr.v6.s6_addr, 16, node_index);
+ break;
+ default:
+ break;
+ }
+ break;
+
+ case SRV_HASH_KEY_ID:
+ default:
+ key = full_hash(s->puid * SRV_EWGHT_RANGE + node_index);
+ break;
+ }
+
+ return key;
+}
+
 /* Adjust the number of entries of a server in its tree. The server must
appear
  * as many times as its weight indicates it. If it's there too often, we
remove
  * the last occurrences. If it's not there enough, we add more
occurrences. To
@@ -67,39 +126,32 @@ static inline void chash_dequeue_srv(struct server *s)
  */
 static inline void chash_queue_dequeue_srv(struct server *s)
 {
- while (s->lb_nodes_now > s->next_eweight) {
- if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
- s->lb_nodes_now = s->lb_nodes_tot;
- s->lb_nodes_now--;
- if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
- s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree,
s->proxy->lbprm.chash.last);
- eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
- }
+ unsigned int j;
+
+ /* First we need to remove all of the server's entries from its tree
+ * because a) the realloc will change all node pointers and b) the keys
+ * may change.
+ */
+ chash_dequeue_srv(s);

  /* Attempt to increase the total number of nodes, if the user
  * increased the weight beyond the original weight
  */
  if (s->lb_nodes_tot < s->next_eweight) {
  struct tree_occ *new_nodes;
-
- /* First we need to remove all server's entries from its tree
- * because the realloc will change all nodes pointers */
- chash_dequeue_srv(s);
-
  new_nodes = realloc(s->lb_nodes, s->next_eweight * sizeof(*new_nodes));
  if (new_nodes) {
- unsigned int j;
-
  s->lb_nodes = new_nodes;
- memset(&s->lb_nodes[s->lb_nodes_tot], 0,
-    (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes));
- for (j = s->lb_nodes_tot; j < s->next_eweight; j++) {
- s->lb_nodes[j].server = s;
- s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j);
- }
  s->lb_nodes_tot = s->next_eweight;
  }
  }
+
+ memset(s->lb_nodes, 0, s->next_eweight * sizeof(*s->lb_nodes));
+ for (j = 0; j < s->next_eweight; j++) {
+ s->lb_nodes[j].server = s;
+ s->lb_nodes[j].node.key = chash_compute_node_key(s, j);
+ }
+
  while (s->lb_nodes_now < s->next_eweight) {
  if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
  break;
@@ -238,9 +290,6 @@ static void chash_update_server_weight(struct server
*srv)
  int old_state, new_state;
  struct proxy *p = srv->proxy;

- if (!srv_lb_status_changed(srv))
- return;
-
  /* If changing the server's weight changes its state, we simply apply
  * the procedures we already have for status change. If the state
  * remains down, the server is not in any tree, so it's as easy as
@@ -507,7 +556,7 @@ int chash_init_server_tree(struct proxy *p)
  }
  for (node = 0; node < srv->lb_nodes_tot; node++) {
  srv->lb_nodes[node].server = srv;
- srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE +
node);
+ srv->lb_nodes[node].node.key = chash_compute_node_key(srv, node);
  }

  if (srv_currently_usable(srv))
diff --git a/src/server.c b/src/server.c
index df24612f5..947319299 100644
--- a/src/server.c
+++ b/src/server.c
@@ -184,6 +184,9 @@ static void _srv_set_inetaddr_port(struct server *srv,
  else
  srv->flags &= ~SRV_F_MAPPORTS;

+        /* balancers (chash in particular) may use the addr in their
routing decisions */
+        srv->proxy->lbprm.update_server_eweight(srv);
+
  if (srv->log_target && srv->log_target->type == LOG_TARGET_DGRAM) {
  /* server is used as a log target, manually update log target addr for
DGRAM */
  ipcpy(addr, srv->log_target->addr);
@@ -948,6 +951,32 @@ static int srv_parse_ws(char **args, int *cur_arg,
  return 0;
 }

+/* Parse the "hash-key" server keyword */
+static int srv_parse_hash_key(char **args, int *cur_arg,
+      struct proxy *curproxy, struct server *newsrv, char **err)
+{
+ if (!args[*cur_arg + 1]) {
+ memprintf(err, "'%s expects 'id', 'addr', or 'addr-port' value",
args[*cur_arg]);
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ if (strcmp(args[*cur_arg + 1], "id") == 0) {
+ newsrv->hash_key = SRV_HASH_KEY_ID;
+ }
+ else if (strcmp(args[*cur_arg + 1], "addr") == 0) {
+ newsrv->hash_key = SRV_HASH_KEY_ADDR;
+ }
+ else if (strcmp(args[*cur_arg + 1], "addr-port") == 0) {
+ newsrv->hash_key = SRV_HASH_KEY_ADDR_PORT;
+ }
+ else {
+ memprintf(err, "'%s' has to be 'id', 'addr', or 'addr-port'",
args[*cur_arg]);
+ return ERR_ALERT | ERR_FATAL;
+ }
+
+ return 0;
+}
+
 /* Parse the "init-addr" server keyword */
 static int srv_parse_init_addr(char **args, int *cur_arg,
                                struct proxy *curproxy, struct server
*newsrv, char **err)
@@ -2221,6 +2250,7 @@ static struct srv_kw_list srv_kws = { "ALL", { }, {
  { "enabled",              srv_parse_enabled,              0,  1,  1 }, /*
Start the server in 'enabled' state */
  { "error-limit",          srv_parse_error_limit,          1,  1,  1 }, /*
Configure the consecutive count of check failures to consider a server on
error */
  { "ws",                   srv_parse_ws,                   1,  1,  1 }, /*
websocket protocol */
+ { "hash-key",             srv_parse_hash_key,             1,  1,  1 }, /*
Configure how chash keys are computed */
  { "id",                   srv_parse_id,                   1,  0,  1 }, /*
set id# of server */
  { "init-addr",            srv_parse_init_addr,            1,  1,  0 }, /*
*/
  { "log-bufsize",          srv_parse_log_bufsize,          1,  1,  0 }, /*
Set the ring bufsize for log server (only for log backends) */
@@ -2664,6 +2694,7 @@ void srv_settings_cpy(struct server *srv, const
struct server *src, int srv_tmpl
  srv->minconn                  = src->minconn;
  srv->maxconn                  = src->maxconn;
  srv->slowstart                = src->slowstart;
+ srv->hash_key                 = src->hash_key;
  srv->observe                  = src->observe;
  srv->onerror                  = src->onerror;
  srv->onmarkeddown             = src->onmarkeddown;
-- 
2.44.0

Reply via email to