---
include/haproxy/backend-t.h | 3 +-
include/haproxy/lb_chash.h | 1 +
src/cfgparse-listen.c | 3 +
src/cfgparse.c | 5 ++
src/lb_chash.c | 129 ++++++++++++++++++++++++++++++++++--
5 files changed, 135 insertions(+), 6 deletions(-)
diff --git a/include/haproxy/backend-t.h b/include/haproxy/backend-t.h
index 126528400..73cddebe4 100644
--- a/include/haproxy/backend-t.h
+++ b/include/haproxy/backend-t.h
@@ -109,7 +109,8 @@
/* hash types */
#define BE_LB_HASH_MAP 0x000000 /* map-based hash (default) */
#define BE_LB_HASH_CONS 0x100000 /* consistent hashbit to indicate a
dynamic algorithm */
-#define BE_LB_HASH_TYPE 0x100000 /* get/clear hash types */
+#define BE_LB_HASH_CONS2X 0x200000 /* consistent hashbit to indicate a dynamic
algorithm */
+#define BE_LB_HASH_TYPE 0x300000 /* get/clear hash types */
/* additional modifier on top of the hash function (only avalanche right now) */
#define BE_LB_HMOD_AVAL 0x200000 /* avalanche modifier */
diff --git a/include/haproxy/lb_chash.h b/include/haproxy/lb_chash.h
index 79504574b..0aa72793a 100644
--- a/include/haproxy/lb_chash.h
+++ b/include/haproxy/lb_chash.h
@@ -30,6 +30,7 @@ struct server;
int chash_init_server_tree(struct proxy *p);
struct server *chash_get_next_server(struct proxy *p, struct server
*srvtoavoid);
struct server *chash_get_server_hash(struct proxy *p, unsigned int hash,
const struct server *avoid);
+struct server *chash_get_server_hash_c2x(struct proxy *p, unsigned int hash);
#endif /* _HAPROXY_LB_CHASH_H */
diff --git a/src/cfgparse-listen.c b/src/cfgparse-listen.c
index 5deec5e6b..ef95fe5dc 100644
--- a/src/cfgparse-listen.c
+++ b/src/cfgparse-listen.c
@@ -2673,6 +2673,9 @@ int cfg_parse_listen(const char *file, int linenum, char
**args, int kwm)
if (strcmp(args[1], "consistent") == 0) { /* use consistent
hashing */
curproxy->lbprm.algo |= BE_LB_HASH_CONS;
}
+ else if (strcmp(args[1], "consistent-2x") == 0) { /* use
consistent-2x hashing */
+ curproxy->lbprm.algo |= BE_LB_HASH_CONS2X;
+ }
else if (strcmp(args[1], "map-based") == 0) { /* use map-based
hashing */
curproxy->lbprm.algo |= BE_LB_HASH_MAP;
}
diff --git a/src/cfgparse.c b/src/cfgparse.c
index 58067c8ed..3b5d4f956 100644
--- a/src/cfgparse.c
+++ b/src/cfgparse.c
@@ -3474,6 +3474,11 @@ int check_config_validity()
if (chash_init_server_tree(curproxy) < 0) {
cfgerr++;
}
+ } else if ((curproxy->lbprm.algo & BE_LB_HASH_TYPE) ==
BE_LB_HASH_CONS2X) {
+ curproxy->lbprm.algo |= BE_LB_LKUP_CHTREE |
BE_LB_PROP_DYN;
+ if (chash_init_server_tree(curproxy) < 0) {
+ cfgerr++;
+ }
} else {
curproxy->lbprm.algo |= BE_LB_LKUP_MAP;
init_server_map(curproxy);
diff --git a/src/lb_chash.c b/src/lb_chash.c
index 023219c98..d204b1fcc 100644
--- a/src/lb_chash.c
+++ b/src/lb_chash.c
@@ -24,6 +24,10 @@
#include <haproxy/server-t.h>
#include <haproxy/tools.h>
+#define CHASH_NUM_NODES 11587
+#define CHASH_BASE_NUM 1000
+#define CHASH_SRV_BASE(srv) ((int)(srv->puid / CHASH_BASE_NUM))
+
/* Return next tree node after <node> which must still be in the tree, or be
* NULL. Lookup wraps around the end to the beginning. If the next node is the
* same node, return NULL. This is designed to find a valid next node before
@@ -155,7 +159,9 @@ static void chash_set_server_status_down(struct server *srv)
p->srv_act--;
}
- chash_dequeue_srv(srv);
+ if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+ chash_dequeue_srv(srv);
+ }
out_update_backend:
/* check/update tot_used, tot_weight */
@@ -217,7 +223,9 @@ static void chash_set_server_status_up(struct server *srv)
}
/* note that eweight cannot be 0 here */
- chash_queue_dequeue_srv(srv);
+ if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+ chash_queue_dequeue_srv(srv);
+ }
out_update_backend:
/* check/update tot_used, tot_weight */
@@ -268,7 +276,9 @@ static void chash_update_server_weight(struct server *srv)
HA_RWLOCK_WRLOCK(LBPRM_LOCK, &p->lbprm.lock);
/* only adjust the server's presence in the tree */
- chash_queue_dequeue_srv(srv);
+ if ((p->lbprm.algo & BE_LB_HASH_TYPE) != BE_LB_HASH_CONS2X) {
+ chash_queue_dequeue_srv(srv);
+ }
if (srv->flags & SRV_F_BACKUP)
p->lbprm.tot_wbck += srv->next_eweight - srv->cur_eweight;
@@ -309,6 +319,107 @@ int chash_server_is_eligible(struct server *s)
return s->served < slots;
}
+/*
+ * This function returns the running server from the CHASH (2x) tree.
+ * Differently from the traditional chash_get_server_hash() function,
+ * this first picks two servers <primary> and <secondary>,
+ * which are at the closest distance from the value of <hash>.
+ * If both are usable, it randomly chooses between them, taking their
+ * weights into account, and returns it.
+ * If one of them is unusable, it returns the latter.
+ * If both are unusable, then it looks for the next usable server in the
+ * tree.
+ * Keep in mind that when a server goes down, it does not get excluded
+ * from the cluster, staying in the tree: the algorithm itself will
+ * guide its decision towards its primary/secondary.
+ */
+struct server *chash_get_server_hash_c2x(struct proxy *p, unsigned int hash)
+{
+ struct eb32_node *start, *cur;
+ struct server *srv = NULL, *primary = NULL, *secondary = NULL,
*first_usable = NULL;
+ struct eb_root *root;
+
+ if (p->srv_act)
+ root = &p->lbprm.chash.act;
+ else if (p->lbprm.fbck)
+ return p->lbprm.fbck;
+ else if (p->srv_bck)
+ root = &p->lbprm.chash.bck;
+ else
+ return NULL;
+
+ /* find the node after and the node before */
+ cur = eb32_lookup_ge(root, hash);
+ if (!cur)
+ cur = eb32_first(root);
+ if (!cur)
+ return NULL; /* tree is empty */
+
+ start = cur;
+ primary = eb32_entry(cur, struct tree_occ, node)->server;
+ int base_id = CHASH_SRV_BASE(primary);
+
+ // find the secondary replica
+ cur = eb32_next(start);
+ if (!cur) {
+ cur = eb32_first(root);
+ }
+
+ while (cur != start) {
+ srv = eb32_entry(cur, struct tree_occ, node)->server;
+ if (CHASH_SRV_BASE(srv) != base_id) {
+ secondary = srv;
+ break;
+ }
+
+ cur = eb32_next(cur);
+ if (!cur) {
+ cur = eb32_first(root);
+ }
+ }
+
+ int primary_usable = srv_currently_usable (primary);
+ int secondary_usable = secondary ? srv_currently_usable (secondary) : 0;
+
+ if (primary_usable && secondary_usable) {
+ uint64_t rsum = primary->cur_eweight + secondary->cur_eweight;
+ // copied from sample.c:smp_fetch_rand()
+ uint64_t r = ((uint64_t)random() * rsum) /
((uint64_t)RAND_MAX+1);
+ return r < primary->cur_eweight ? primary : secondary;
+ } else if (primary_usable) {
+ return primary;
+ } else if (secondary_usable) {
+ return secondary;
+ } else {
+ // find a usable server, possibly with different base,
regardless of the weight
+ cur = eb32_next(start);
+ if (!cur) {
+ cur = eb32_first(root);
+ }
+
+ while (cur != start) {
+ srv = eb32_entry(cur, struct tree_occ, node)->server;
+ if (srv_currently_usable(srv)) {
+ if (CHASH_SRV_BASE(srv) != base_id) {
+ return srv;
+ } else if (!first_usable) {
+ first_usable = srv;
+ }
+ }
+
+ cur = eb32_next(cur);
+ if (!cur) {
+ cur = eb32_first(root);
+ }
+ }
+ }
+
+ // at this point we couldn't find any replica with different base
+ // just return the first usable server, if any
+
+ return first_usable;
+}
+
/*
* This function returns the running server from the CHASH tree, which is at
* the closest distance from the value of <hash>. Doing so ensures that even
@@ -326,6 +437,10 @@ struct server *chash_get_server_hash(struct proxy *p,
unsigned int hash, const s
struct eb_root *root;
unsigned int dn, dp;
int loop;
+ if ((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) {
+ nsrv = chash_get_server_hash_c2x(p, hash);
+ return nsrv;
+ }
HA_RWLOCK_RDLOCK(LBPRM_LOCK, &p->lbprm.lock);
@@ -497,7 +612,11 @@ int chash_init_server_tree(struct proxy *p)
/* queue active and backup servers in two distinct groups */
for (srv = p->srv; srv; srv = srv->next) {
srv->lb_tree = (srv->flags & SRV_F_BACKUP) ? &p->lbprm.chash.bck :
&p->lbprm.chash.act;
- srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
+ if ((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) {
+ srv->lb_nodes_tot = CHASH_NUM_NODES;
+ } else {
+ srv->lb_nodes_tot = srv->uweight * BE_WEIGHT_SCALE;
+ }
srv->lb_nodes_now = 0;
srv->lb_nodes = calloc(srv->lb_nodes_tot,
sizeof(*srv->lb_nodes));
@@ -510,7 +629,7 @@ int chash_init_server_tree(struct proxy *p)
srv->lb_nodes[node].node.key = full_hash(srv->puid *
SRV_EWGHT_RANGE + node);
}
- if (srv_currently_usable(srv))
+ if (((p->lbprm.algo & BE_LB_HASH_TYPE) == BE_LB_HASH_CONS2X) ||
srv_currently_usable(srv))
chash_queue_dequeue_srv(srv);
}
return 0;
--
2.30.2