Hi Jon,
Please take a look at v2. The mechanism is the same as the one I
introduced earlier in commit:
c55c8edafa91 ("tipc: smooth change between replicast and broadcast")
However, in this case we handle only one direction: replicast -> broadcast.
Therefore, it is still backward compatible.
[...]
From ae2ee6a7064de3ec1dc2c7df2db241d22b0d129f Mon Sep 17 00:00:00 2001
From: Hoang Le <[email protected]>
Date: Wed, 13 Nov 2019 14:01:03 +0700
Subject: [PATCH] tipc: update a binding service via broadcast
Currently, a binding table update (adding a service binding to the
name table or withdrawing a service binding) is sent over replicast.
However, when scaling clusters up to > 100 nodes/containers this
method is less efficient, because it loops through the nodes in the
cluster one by one.
It is worthwhile to use broadcast to update a binding service, so that
the binding tables in all nodes are updated in one shot.
The mechanism is backward compatible because only the sending side is
changed.
v2: resolve synchronization problem when switching from unicast to
broadcast
Signed-off-by: Hoang Le <[email protected]>
---
net/tipc/bcast.c | 13 +++++++++++++
net/tipc/bcast.h | 2 ++
net/tipc/link.c | 16 ++++++++++++++++
net/tipc/name_distr.c | 8 ++++++++
net/tipc/name_table.c | 9 ++++++---
5 files changed, 45 insertions(+), 3 deletions(-)
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index f41096a759fa..18431fa897ab 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct
sk_buff_head *defq,
__skb_queue_tail(inputq, _skb);
}
}
+
+int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb)
+{
+ struct sk_buff_head xmitq;
+ u16 cong_link_cnt;
+ int rc = 0;
+
+ __skb_queue_head_init(&xmitq);
+ __skb_queue_tail(&xmitq, skb);
+ rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt);
+ __skb_queue_purge(&xmitq);
+ return rc;
+}
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index dadad953e2be..a100da3800fc 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net);
u32 tipc_bcast_get_broadcast_mode(struct net *net);
u32 tipc_bcast_get_broadcast_ratio(struct net *net);
+int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb);
+
void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq,
struct sk_buff_head *inputq);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index fb72031228c9..22f1854435df 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -187,6 +187,9 @@ struct tipc_link {
struct tipc_link *bc_sndlink;
u8 nack_state;
bool bc_peer_is_up;
+ bool named_sync;
+ struct sk_buff_head defer_namedq;
+
/* Statistics */
struct tipc_stats stats;
@@ -363,6 +366,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
trace_tipc_link_reset(rcv_l, TIPC_DUMP_ALL, "bclink removed!");
tipc_link_reset(rcv_l);
rcv_l->state = LINK_RESET;
+ rcv_l->named_sync = false;
if (!snd_l->ackers) {
trace_tipc_link_reset(snd_l, TIPC_DUMP_ALL, "zero ackers!");
tipc_link_reset(snd_l);
@@ -508,6 +512,7 @@ bool tipc_link_create(struct net *net, char *if_name, int
bearer_id,
__skb_queue_head_init(&l->failover_deferdq);
skb_queue_head_init(&l->wakeupq);
skb_queue_head_init(l->inputq);
+ __skb_queue_head_init(&l->defer_namedq);
return true;
}
@@ -932,6 +937,8 @@ void tipc_link_reset(struct tipc_link *l)
l->silent_intv_cnt = 0;
l->rst_cnt = 0;
l->bc_peer_is_up = false;
+ l->named_sync = false;
+ __skb_queue_purge(&l->defer_namedq);
memset(&l->mon_state, 0, sizeof(l->mon_state));
tipc_link_reset_stats(l);
}
@@ -1210,6 +1217,15 @@ static bool tipc_data_input(struct tipc_link *l, struct
sk_buff *skb,
return true;
case NAME_DISTRIBUTOR:
l->bc_rcvlink->state = LINK_ESTABLISHED;
+ if (msg_is_syn(hdr)) {
+ l->bc_rcvlink->named_sync = true;
+ skb_queue_splice_tail_init(&l->defer_namedq, l->namedq);
+ return true;
+ }
+ if (msg_is_rcast(hdr) && !l->bc_rcvlink->named_sync) {
+ skb_queue_tail(&l->defer_namedq, skb);
+ return true;
+ }
skb_queue_tail(l->namedq, skb);
return true;
case MSG_BUNDLER:
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index 5feaf3b67380..419b3f0f102d 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -180,6 +180,14 @@ static void named_distribute(struct net *net, struct
sk_buff_head *list,
skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem));
__skb_queue_tail(list, skb);
}
+
+ /* Allocate dummy message in order to synchronize w/bcast */
+ skb = named_prepare_buf(net, PUBLICATION, 0, dnode);
+ if (skb) {
+ /* Preparing for 'synching' header */
+ msg_set_syn(buf_msg(skb), 1);
+ __skb_queue_tail(list, skb);
+ }
}
/**
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 66a65c2cdb23..4ba6d73e5c4c 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -632,8 +632,10 @@ struct publication *tipc_nametbl_publish(struct net *net,
u32 type, u32 lower,
exit:
spin_unlock_bh(&tn->nametbl_lock);
- if (skb)
- tipc_node_broadcast(net, skb);
+ if (skb) {
+ msg_set_is_rcast(buf_msg(skb), true);
+ tipc_bcast_named_publish(net, skb);
+ }
return p;
}
@@ -664,7 +666,8 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32
lower,
spin_unlock_bh(&tn->nametbl_lock);
if (skb) {
- tipc_node_broadcast(net, skb);
+ msg_set_is_rcast(buf_msg(skb), true);
+ tipc_bcast_named_publish(net, skb);
return 1;
}
return 0;
--
2.20.1
-----Original Message-----
From: Jon Maloy <[email protected]>
Sent: Wednesday, November 13, 2019 7:02 PM
To: Hoang Huu Le <[email protected]>; [email protected];
[email protected]
Cc: 'Ying Xue' <[email protected]>
Subject: RE: [net-next] tipc: update a binding service via broadcast
Hi Hoang,
This is good, but you have missed the point about the synchronization problem I
have been talking about.
1) A new node comes up
2) The "bulk" binding table update is sent, as a series of packets over the new
unicast link. This may take some time.
3) The owner of one of the bindings in the bulk (on this node) does unbind.
4) This is sent as broadcast withdraw to all nodes, and arrives before the last
packets of the unicast bulk to the newly connected node.
5) Since there is no corresponding publication in the peer node's binding table
yet, the withdraw is ignored.
6) The last bulk unicasts arrive at the new peer, and the now invalid
publication is added to its binding table.
7) This publication will stay there forever.
We need to find a way to synchronize so that we know that all the bulk
publications are in place in the binding table before any broadcast
publications/withdraws can be accepted.
Obviously, we could create a backlog queue in the name table, but I hope we can
find a simpler and neater solution.
Regards
///jon
> -----Original Message-----
> From: Hoang Le <[email protected]>
> Sent: 13-Nov-19 02:35
> To: Jon Maloy <[email protected]>; [email protected];
> [email protected]
> Subject: [net-next] tipc: update a binding service via broadcast
>
> Currently, updating binding table (add service binding to
> name table/withdraw a service binding) is being sent over replicast.
> However, if we are scaling up clusters to > 100 nodes/containers this
> method is less affection because of looping through nodes in a cluster one
> by one.
>
> It is worth to use broadcast to update a binding service. Then binding
> table updates in all nodes for one shot.
>
> The mechanism is backward compatible because of sending side changing.
>
> Signed-off-by: Hoang Le <[email protected]>
> ---
> net/tipc/bcast.c | 13 +++++++++++++
> net/tipc/bcast.h | 2 ++
> net/tipc/name_table.c | 4 ++--
> 3 files changed, 17 insertions(+), 2 deletions(-)
>
> diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
> index f41096a759fa..18431fa897ab 100644
> --- a/net/tipc/bcast.c
> +++ b/net/tipc/bcast.c
> @@ -843,3 +843,16 @@ void tipc_mcast_filter_msg(struct net *net, struct
> sk_buff_head *defq,
> __skb_queue_tail(inputq, _skb);
> }
> }
> +
> +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb)
> +{
> + struct sk_buff_head xmitq;
> + u16 cong_link_cnt;
> + int rc = 0;
> +
> + __skb_queue_head_init(&xmitq);
> + __skb_queue_tail(&xmitq, skb);
> + rc = tipc_bcast_xmit(net, &xmitq, &cong_link_cnt);
> + __skb_queue_purge(&xmitq);
> + return rc;
> +}
> diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
> index dadad953e2be..a100da3800fc 100644
> --- a/net/tipc/bcast.h
> +++ b/net/tipc/bcast.h
> @@ -101,6 +101,8 @@ int tipc_bclink_reset_stats(struct net *net);
> u32 tipc_bcast_get_broadcast_mode(struct net *net);
> u32 tipc_bcast_get_broadcast_ratio(struct net *net);
>
> +int tipc_bcast_named_publish(struct net *net, struct sk_buff *skb);
> +
> void tipc_mcast_filter_msg(struct net *net, struct sk_buff_head *defq,
> struct sk_buff_head *inputq);
>
> diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
> index 66a65c2cdb23..9e9c61f7c999 100644
> --- a/net/tipc/name_table.c
> +++ b/net/tipc/name_table.c
> @@ -633,7 +633,7 @@ struct publication *tipc_nametbl_publish(struct net *net,
> u32 type, u32 lower,
> spin_unlock_bh(&tn->nametbl_lock);
>
> if (skb)
> - tipc_node_broadcast(net, skb);
> + tipc_bcast_named_publish(net, skb);
> return p;
> }
>
> @@ -664,7 +664,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32
> lower,
> spin_unlock_bh(&tn->nametbl_lock);
>
> if (skb) {
> - tipc_node_broadcast(net, skb);
> + tipc_bcast_named_publish(net, skb);
> return 1;
> }
> return 0;
> --
> 2.20.1
_______________________________________________
tipc-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/tipc-discussion