>From a031cf97da759eb2c2f9b6e191065ad503f821ed Mon Sep 17 00:00:00 2001
From: Anthony Deschamps <anthony.j.descha...@gmail.com>
Date: Fri, 16 Feb 2024 16:00:35 -0500
Subject: [PATCH] MEDIUM: lb-chash: Deterministic node hashes based on server
 address

Motivation: When services are discovered through DNS resolution, the order
in
which DNS records get resolved and assigned to servers is arbitrary.
Therefore,
even though two HAProxy instances using chash balancing might agree that a
particular request should go to server3, it is likely the case that they
have
assigned different IPs and ports to the server in that slot.

By deriving the keys for the chash tree nodes from a server's address and
port
we ensure that independent HAProxy instances will agree on routing
decisions. If
an address is not known then the key is derived from the server's puid as
it was
previously.

When adjusting a server's weight, we now unconditionally remove all nodes
first,
since the server's address (and therefore the desired node keys) may have
changed.
---
 src/lb_chash.c | 71 ++++++++++++++++++++++++++++++++++----------------
 1 file changed, 48 insertions(+), 23 deletions(-)

diff --git a/src/lb_chash.c b/src/lb_chash.c
index 1c8034d89..bd39a07f8 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -21,8 +21,9 @@
 #include <haproxy/backend.h>
 #include <haproxy/errors.h>
 #include <haproxy/queue.h>
-#include <haproxy/server-t.h>
+#include <haproxy/server.h>
 #include <haproxy/tools.h>
+#include <haproxy/xxhash.h>

 /* Return next tree node after <node> which must still be in the tree, or
be
  * NULL. Lookup wraps around the end to the beginning. If the next node is
the
@@ -58,6 +59,37 @@ static inline void chash_dequeue_srv(struct server *s)
  }
 }

+/* Compute a key that can be used to insert a node into the CHASH tree. If
the
+ * server has a known address then the key will be determined from its
address
+ * and port. This means that independent HAProxy instances will agree on
routing
+ * decisions. If an address is not known then the key will be determined
from
+ * the server's puid.
+ */
+static inline u32 chash_compute_node_key(struct server *s, unsigned
node_index)
+{
+ u32 key;
+ struct server_inetaddr srv_addr;
+
+ server_get_inetaddr(s, &srv_addr);
+
+ switch (srv_addr.family) {
+ case AF_INET:
+ key = full_hash(srv_addr.addr.v4.s_addr);
+ key ^= full_hash(srv_addr.port.svc);
+ key ^= full_hash(node_index);
+ break;
+ case AF_INET6:
+ unsigned seed = (srv_addr.port.svc << 16) + node_index;
+ key = XXH32(srv_addr.addr.v6.s6_addr, 16, seed);
+ break;
+ default:
+ key = full_hash(s->puid * SRV_EWGHT_RANGE + node_index);
+ break;
+ }
+
+ return key;
+}
+
 /* Adjust the number of entries of a server in its tree. The server must
appear
  * as many times as its weight indicates it. If it's there too often, we
remove
  * the last occurrences. If it's not there enough, we add more
occurrences. To
@@ -67,39 +99,32 @@ static inline void chash_dequeue_srv(struct server *s)
  */
 static inline void chash_queue_dequeue_srv(struct server *s)
 {
- while (s->lb_nodes_now > s->next_eweight) {
- if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
- s->lb_nodes_now = s->lb_nodes_tot;
- s->lb_nodes_now--;
- if (s->proxy->lbprm.chash.last == &s->lb_nodes[s->lb_nodes_now].node)
- s->proxy->lbprm.chash.last = chash_skip_node(s->lb_tree,
s->proxy->lbprm.chash.last);
- eb32_delete(&s->lb_nodes[s->lb_nodes_now].node);
- }
+ unsigned int j;
+
+ /* First we need to remove all of the server's entries from its tree
+ * because a) the realloc will change all node pointers and b) the keys
+ * may change.
+ */
+ chash_dequeue_srv(s);

  /* Attempt to increase the total number of nodes, if the user
  * increased the weight beyond the original weight
  */
  if (s->lb_nodes_tot < s->next_eweight) {
  struct tree_occ *new_nodes;
-
- /* First we need to remove all server's entries from its tree
- * because the realloc will change all nodes pointers */
- chash_dequeue_srv(s);
-
  new_nodes = realloc(s->lb_nodes, s->next_eweight * sizeof(*new_nodes));
  if (new_nodes) {
- unsigned int j;
-
  s->lb_nodes = new_nodes;
- memset(&s->lb_nodes[s->lb_nodes_tot], 0,
-    (s->next_eweight - s->lb_nodes_tot) * sizeof(*s->lb_nodes));
- for (j = s->lb_nodes_tot; j < s->next_eweight; j++) {
- s->lb_nodes[j].server = s;
- s->lb_nodes[j].node.key = full_hash(s->puid * SRV_EWGHT_RANGE + j);
- }
  s->lb_nodes_tot = s->next_eweight;
  }
  }
+
+ memset(s->lb_nodes, 0, s->next_eweight * sizeof(*s->lb_nodes));
+ for (j = 0; j < s->next_eweight; j++) {
+ s->lb_nodes[j].server = s;
+ s->lb_nodes[j].node.key = chash_compute_node_key(s, j);
+ }
+
  while (s->lb_nodes_now < s->next_eweight) {
  if (s->lb_nodes_now >= s->lb_nodes_tot) // should always be false anyway
  break;
@@ -507,7 +532,7 @@ int chash_init_server_tree(struct proxy *p)
  }
  for (node = 0; node < srv->lb_nodes_tot; node++) {
  srv->lb_nodes[node].server = srv;
- srv->lb_nodes[node].node.key = full_hash(srv->puid * SRV_EWGHT_RANGE +
node);
+ srv->lb_nodes[node].node.key = chash_compute_node_key(srv, node);
  }

  if (srv_currently_usable(srv))
-- 
2.43.2

Reply via email to