[tipc-discussion] [PATCH net-next 1/3] tipc: transfer broadcast nacks in link state messages

2016-09-01 Thread Jon Maloy
When we send broadcasts in clusters of more than 70-80 nodes, we sometimes
see the broadcast link resetting because of an excessive number of
retransmissions. This is caused by a combination of two factors:

1) A 'NACK crunch", where loss of broadcast packets is discovered
   and NACK'ed by several nodes simultaneously, leading to multiple
   redundant broadcast retransmissions.

2) The fact that the NACKs themselves are also sent as broadcast, leading
   to excessive load and packet loss on the transmitting switch/bridge.

This commit deals with the latter problem by moving the sending of
broadcast NACKs from the dedicated BCAST_PROTOCOL/NACK message type
to regular unicast LINK_PROTOCOL/STATE messages. We allocate 10 unused
bits in word 8 of that message for this purpose, and introduce a
new capability bit, TIPC_BCAST_STATE_NACK, in order to keep the change
backwards compatible.

Reviewed-by: Ying Xue 
Signed-off-by: Jon Maloy 
---
 net/tipc/bcast.c |  8 ---
 net/tipc/bcast.h |  4 ++--
 net/tipc/link.c  | 64 
 net/tipc/link.h  |  6 +++---
 net/tipc/msg.h   | 10 +
 net/tipc/node.c  | 32 ++--
 net/tipc/node.h  | 11 ++
 7 files changed, 108 insertions(+), 27 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index ae469b3..753f774 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -269,18 +269,19 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked)
  *
  * RCU is locked, no other locks set
  */
-void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
-struct tipc_msg *hdr)
+int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+   struct tipc_msg *hdr)
 {
struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
struct sk_buff_head xmitq;
+   int rc = 0;
 
__skb_queue_head_init(&xmitq);
 
tipc_bcast_lock(net);
if (msg_type(hdr) == STATE_MSG) {
tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr), &xmitq);
-   tipc_link_bc_sync_rcv(l, hdr, &xmitq);
+   rc = tipc_link_bc_sync_rcv(l, hdr, &xmitq);
} else {
tipc_link_bc_init_rcv(l, hdr);
}
@@ -291,6 +292,7 @@ void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
/* Any socket wakeup messages ? */
if (!skb_queue_empty(inputq))
tipc_sk_rcv(net, inputq);
+   return rc;
 }
 
 /* tipc_bcast_add_peer - add a peer node to broadcast link and bearer
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index d5e79b3..5ffe344 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -56,8 +56,8 @@ int  tipc_bcast_get_mtu(struct net *net);
 int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list);
 int tipc_bcast_rcv(struct net *net, struct tipc_link *l, struct sk_buff *skb);
 void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l, u32 acked);
-void tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
-struct tipc_msg *hdr);
+int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
+   struct tipc_msg *hdr);
 int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg);
 int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
 int tipc_bclink_reset_stats(struct net *net);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 2c6e1b9..136316f 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -367,6 +367,18 @@ int tipc_link_bc_peers(struct tipc_link *l)
return l->ackers;
 }
 
+u16 link_bc_rcv_gap(struct tipc_link *l)
+{
+   struct sk_buff *skb = skb_peek(&l->deferdq);
+   u16 gap = 0;
+
+   if (more(l->snd_nxt, l->rcv_nxt))
+   gap = l->snd_nxt - l->rcv_nxt;
+   if (skb)
+   gap = buf_seqno(skb) - l->rcv_nxt;
+   return gap;
+}
+
 void tipc_link_set_mtu(struct tipc_link *l, int mtu)
 {
l->mtu = mtu;
@@ -1135,7 +1147,10 @@ int tipc_link_build_state_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
return 0;
l->rcv_unacked = 0;
-   return TIPC_LINK_SND_BC_ACK;
+
+   /* Use snd_nxt to store peer's snd_nxt in broadcast rcv link */
+   l->snd_nxt = l->rcv_nxt;
+   return TIPC_LINK_SND_STATE;
}
 
/* Unicast ACK */
@@ -1236,7 +1251,7 @@ int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
rc |= tipc_link_input(l, skb, l->inputq);
if (unlikely(++l->rcv_unacked >= TIPC_MIN_LINK_WIN))
rc |= tipc_link_build_state_msg(l, xmitq);
-   if (unlikely(rc & ~TIPC_LINK_SND_BC_ACK))
+   if (unlikely(rc & ~TIPC_LINK_SND_STATE))
break;
} while ((skb = __skb_dequeue(defq)));
 
@@ -1250,10 +1265,11 @@ static void tipc_link_build_proto_msg(struct

Re: [tipc-discussion] [PATCH net-next 1/3] tipc: transfer broadcast nacks in link state messages

2016-08-23 Thread Jon Maloy


> -----Original Message-----
> From: Xue, Ying [mailto:ying@windriver.com]
> Sent: Tuesday, 23 August, 2016 05:48
> To: Jon Maloy ; tipc-discussion@lists.sourceforge.net;
> Parthasarathy Bhuvaragan ; Richard
> Alpe 
> Cc: ma...@donjonn.com; gbala...@gmail.com
> Subject: RE: [PATCH net-next 1/3] tipc: transfer broadcast nacks in link state
> messages
> 
> -----Original Message-----
> From: Jon Maloy [mailto:jon.ma...@ericsson.com]
> Sent: Wednesday, August 17, 2016 2:09 AM
> To: tipc-discussion@lists.sourceforge.net;
> parthasarathy.bhuvara...@ericsson.com; Xue, Ying; richard.a...@ericsson.com;
> jon.ma...@ericsson.com
> Cc: ma...@donjonn.com; gbala...@gmail.com
> Subject: [PATCH net-next 1/3] tipc: transfer broadcast nacks in link state
> messages
> 
> When we send broadcasts in clusters of more than 70-80 nodes, we sometimes
> see the broadcast link resetting because of an excessive number of
> retransmissions. This is caused by a combination of two factors:
> 
> 1) A 'NACK crunch', where loss of broadcast packets is discovered
>    and NACK'ed by several nodes simultaneously, leading to multiple
>    redundant broadcast retransmissions.
> 
> 2) The fact that the NACKs themselves are also sent as broadcast, leading
>    to excessive load and packet loss on the transmitting switch/bridge.
> 
> This commit deals with the latter problem by moving the sending of
> broadcast NACKs from the dedicated BCAST_PROTOCOL/NACK message type
> to regular unicast LINK_PROTOCOL/STATE messages. We allocate 10 unused
> bits in word 8 of that message for this purpose, and introduce a
> new capability bit, TIPC_BCAST_STATE_NACK, in order to keep the change
> backwards compatible.
> 
> [Ying] Using unicast to send NACK messages is much better than bcast.
> 
> 
> /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
>   */
> -void tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
> -struct sk_buff_head *xmitq)
> +int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
> +   struct sk_buff_head *xmitq)
>  {
>   u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
> + u16 from = msg_bcast_ack(hdr) + 1;
> + u16 to = from + msg_bc_gap(hdr) - 1;
> + int rc = 0;
> 
>   if (!link_is_up(l))
> - return;
> + return rc;
> 
>   if (!msg_peer_node_is_up(hdr))
> - return;
> + return rc;
> 
>   /* Open when peer acknowledges our bcast init msg (pkt #1) */
>   if (msg_ack(hdr))
>   l->bc_peer_is_up = true;
> 
>   if (!l->bc_peer_is_up)
> - return;
> + return rc;
> 
>   /* Ignore if peers_snd_nxt goes beyond receive window */
>   if (more(peers_snd_nxt, l->rcv_nxt + l->window))
> - return;
> + return rc;
> +
> + if (!less(to, from)) {
> + rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
> + l->stats.recv_nacks++;
> + }
> +
> + l->snd_nxt = peers_snd_nxt;
> + if (link_bc_rcv_gap(l))
> + rc |= TIPC_LINK_SND_STATE;
> +
> + /* Return now if sender supports nack via STATE messages */
> + if (l->peer_caps & TIPC_BCAST_STATE_NACK)
> + return rc;
> +
> 
> [Ying] If a peer node doesn't support the new NACK mechanism, it seems we not
> only retransmit the missed messages through tipc_link_retrans(), but also send
> a NACK message created with tipc_link_build_bc_proto_msg(). In the old mode we
> don't retransmit the missed messages at this point. Do you think this
> difference in behavior is right?

In the old code, link_bc_sync_rcv() dealt with only one task: letting the 
receiving endpoint discover gaps in the packet sequence and, if necessary, send 
NACKs according to certain rules.

In the new code this function has two tasks: 1) the receiving endpoint still 
discovers gaps, but now sends NACKs unconditionally (in STATE messages); 2) the 
sending endpoint registers the gaps reported in msg_bc_gap() (i.e., received 
NACKs) and performs retransmission as requested.
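
As a rough user-space sketch of these two tasks (the types and helper names 
below are stand-ins invented for this example, not the kernel structures; the 
actual logic is the quoted tipc_link_bc_sync_rcv() above):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wrap-around aware comparison, mirroring the kernel's more()/less() */
static bool seqno_more(uint16_t a, uint16_t b)
{
	return (int16_t)(a - b) > 0;
}

struct peer_state {              /* fields carried in a received STATE message */
	uint16_t bcast_ack;      /* last bcast packet the peer received in sequence */
	uint16_t bc_gap;         /* gap (NACK) the peer reports, 0 if none */
	uint16_t bc_snd_nxt;     /* peer's next broadcast send sequence number */
};

struct bc_rcv_link {
	uint16_t rcv_nxt;        /* next bcast packet we expect to receive */
	uint16_t peer_snd_nxt;   /* last known peer snd_nxt */
};

static void retransmit(uint16_t from, uint16_t to)
{
	/* stand-in for tipc_link_retrans() on the bcast send link */
	printf("retransmit %u..%u\n", (unsigned)from, (unsigned)to);
}

/* Task 2: the sender acts on a gap reported by the peer (a received NACK) */
static void handle_reported_gap(const struct peer_state *ps)
{
	uint16_t from = ps->bcast_ack + 1;
	uint16_t to = from + ps->bc_gap - 1;

	if (ps->bc_gap)          /* i.e. !less(to, from) in the patch */
		retransmit(from, to);
}

/* Task 1: the receiver detects its own gap; a true return maps to
 * TIPC_LINK_SND_STATE, i.e. "send a unicast STATE message carrying our gap" */
static bool handle_local_gap(struct bc_rcv_link *l, const struct peer_state *ps)
{
	l->peer_snd_nxt = ps->bc_snd_nxt;
	return seqno_more(l->peer_snd_nxt, l->rcv_nxt);
}

int main(void)
{
	struct peer_state ps = { .bcast_ack = 99, .bc_gap = 3, .bc_snd_nxt = 110 };
	struct bc_rcv_link l = { .rcv_nxt = 105 };

	handle_reported_gap(&ps);                   /* prints: retransmit 100..102 */
	if (handle_local_gap(&l, &ps))
		printf("send STATE with gap %u\n",
		       (unsigned)(uint16_t)(l.peer_snd_nxt - l.rcv_nxt));  /* 5 */
	return 0;
}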

Now, if the receiving node supports the new algorithm and the sending node 
doesn't, the receiver will produce an 'old' NACK message and send it out, but 
not a 'new' one. Neither will it retransmit anything, both because there 
probably is nothing to retransmit (it is the receiver), and because 
msg_bc_gap() can only be zero, so 'to' will be less than 'from' and nothing 
will ever be retransmitted.

If it is the other way around, i.e. the sender supports the new algorithm and 
the receiver doesn't, the receiver can only produce an 'old' NACK, and the 
sender will have to retransmit in the (now otherwise unused) 
tipc_link_bc_nack_rcv() function. Once again, msg_bc_gap() can only be zero, 
so nothing will be retransmitted in link_bc_sync_rcv().

So, from an 'old' node's viewpoint there is no change in behavior.
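
A minimal sketch of the capability gating discussed here (only the name 
TIPC_BCAST_STATE_NACK and the peer_caps check come from the patch; the bit 
value and the helper are assumptions made for this example):

#include <stdbool.h>
#include <stdint.h>

#define TIPC_BCAST_STATE_NACK  (1u << 2)   /* assumed bit value, illustration only */

/* true:  ask for retransmission via the gap field of unicast STATE messages;
 * false: fall back to a dedicated BCAST_PROTOCOL/NACK for legacy peers */
static bool nack_via_state_msg(uint16_t peer_caps)
{
	return (peer_caps & TIPC_BCAST_STATE_NACK) != 0;
}

int main(void)
{
	return nack_via_state_msg(TIPC_BCAST_STATE_NACK) ? 0 : 1;
}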
