[tipc-discussion] [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

2016-09-01 Thread Jon Maloy
As cluster sizes grow, so does the amount of identical or overlapping
broadcast NACKs generated by the packet receivers. This often leads to
'NACK crunches' resulting in huge numbers of redundant retransmissions
of the same packet ranges.

In this commit, we introduce rate control of broadcast retransmissions,
so that a retransmitted range cannot be retransmitted again until after
at least 10 ms. This reduces the frequency of duplicate, redundant
retransmissions by an order of magnitude, while having a significant
positive impact on overall throughput and scalability.

Reviewed-by: Ying Xue 
Signed-off-by: Jon Maloy 
---
 net/tipc/link.c | 52 +++-
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 136316f..58bb44d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -181,7 +181,10 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
-   int nack_state;
+   unsigned long prev_retr;
+   u16 prev_from;
+   u16 prev_to;
+   u8 nack_state;
bool bc_peer_is_up;
 
/* Statistics */
@@ -202,6 +205,8 @@ enum {
BC_NACK_SND_SUPPRESS,
 };
 
+#define TIPC_BC_RETR_LIMIT 10   /* [ms] */
+
 /*
  * Interval between NACKs when packets arrive out of order
  */
@@ -1590,11 +1595,48 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct 
tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
 }
 
+/* link_bc_retr eval()- check if the indicated range can be retransmitted now
+ * - Adjust permitted range if there is overlap with previous retransmission
+ */
+static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
+{
+   unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
+
+   if (less(*to, *from))
+   return false;
+
+   /* New retransmission request */
+   if ((elapsed > TIPC_BC_RETR_LIMIT) ||
+   less(*to, l->prev_from) || more(*from, l->prev_to)) {
+   l->prev_from = *from;
+   l->prev_to = *to;
+   l->prev_retr = jiffies;
+   return true;
+   }
+
+   /* Inside range of previous retransmit */
+   if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
+   return false;
+
+   /* Fully or partially outside previous range => exclude overlap */
+   if (less(*from, l->prev_from)) {
+   *to = l->prev_from - 1;
+   l->prev_from = *from;
+   }
+   if (more(*to, l->prev_to)) {
+   *from = l->prev_to + 1;
+   l->prev_to = *to;
+   }
+   l->prev_retr = jiffies;
+   return true;
+}
+
 /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
  */
 int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
  struct sk_buff_head *xmitq)
 {
+   struct tipc_link *snd_l = l->bc_sndlink;
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
u16 from = msg_bcast_ack(hdr) + 1;
u16 to = from + msg_bc_gap(hdr) - 1;
@@ -1613,14 +1655,14 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct 
tipc_msg *hdr,
if (!l->bc_peer_is_up)
return rc;
 
+   l->stats.recv_nacks++;
+
/* Ignore if peers_snd_nxt goes beyond receive window */
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
 
-   if (!less(to, from)) {
-   rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
-   l->stats.recv_nacks++;
-   }
+   if (link_bc_retr_eval(snd_l, , ))
+   rc = tipc_link_retrans(snd_l, from, to, xmitq);
 
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
-- 
2.7.4


--
___
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion


Re: [tipc-discussion] [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

2016-08-31 Thread Xue, Ying


-Original Message-
From: Jon Maloy [mailto:jon.ma...@ericsson.com] 
Sent: Wednesday, August 31, 2016 12:06 AM
To: Xue, Ying; Jon Maloy; tipc-discussion@lists.sourceforge.net; Parthasarathy 
Bhuvaragan; Richard Alpe
Cc: gbala...@gmail.com
Subject: RE: [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions



> -Original Message-
> From: Xue, Ying [mailto:ying@windriver.com]
> Sent: Tuesday, 30 August, 2016 05:55
> To: Jon Maloy ; Jon Maloy ; 
> tipc-discussion@lists.sourceforge.net; Parthasarathy Bhuvaragan 
> ; Richard Alpe 
> 
> Cc: gbala...@gmail.com
> Subject: RE: [PATCH net-next 2/3] tipc: rate limit broadcast 
> retransmissions
> 
> > +/* link_bc_retr eval()- check if the indicated range can be 
> > +retransmitted now
> > + * - Adjust permitted range if there is overlap with previous 
> > +retransmission  */ static bool link_bc_retr_eval(struct tipc_link 
> > +*l,
> > +u16 *from, u16 *to) {
> > +   unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
> > +
> > +   if (less(*to, *from))
> > +   return false;
> > +
> > +   /* New retransmission request */
> > +   if ((elapsed > TIPC_BC_RETR_LIMIT) ||
> > +   less(*to, l->prev_from) || more(*from, l->prev_to)) {
> > +   l->prev_from = *from;
> > +   l->prev_to = *to;
> > +   l->prev_retr = jiffies;
> > +   return true;
> > +   }
> > +
> >
> > [Ying] In the my understanding, the statement above seems to be 
> > changed as
> below:
> >
> > if ((elapsed > TIPC_BC_RETR_LIMIT) && (less(*to, l->prev_from) ||
> more(*from, l->prev_to)))
> >
> > Otherwise, I think the rate of retransmission may be still very high 
> > especially
> when packet disorder frequently happens.
> > As it's very normal that BCast retransmission range requested from 
> > different
> nodes is not same, I guess the rate is not well suppressed in above condition.
> >
> > Maybe, we can use the method which is being effective in 
> > tipc_sk_enqueue()
> to limit the rate like:
> 
> The condition indicates that the two retransmission requests are 
> completely disjoint, e.g., 10-12 and 13-15.
> 
> [Ying]Understood, and it makes sense.
> 
> I don't see any reason why
> we should wait with retransmitting 13-15 in this case, as it is 
> obvious that somebody is missing it, and we haven't retransmitted it before.
> 
> [Ying] Most of cases I observed on bcast before, is that after NACK is 
> just submitted to request sender to retransmit missed packets, the 
> requested packets quickly arrives soon. But tragically sender still 
> retransmits the requested packets through bcast later, leading to 
> waste network bandwidth.  Especially when packets are received in 
> disorder, it will further worsen the situation, which may be one of 
> reasons why we see many duplicated bcast message appearing on network 
> wire. To avoid this case, it's better to a bit postpone the 
> retransmission of messages on sender, on the contrary, we also can 
> restrain the retransmission request on receiver side. In other words, when 
> receiver detects that there is a gap at the first time, it's better to delay 
> a bit time to submit NACK message.
> 

There is an implicit delay in the fact that the gap detection is made through 
looking at STATE messages, and not directly from the packet flow. Reception of 
STATE messages should typically happen after the indicated "last_sent" packet 
already has been received, and after the detection we want one single 
retransmission of the missing packets as soon as possible in order to not waste 
bandwidth.
To further speed up detection and retransmission, you will see that I in the 
next commit introduce detection and NACK-ing directly from the incoming packet 
stream, although it is delayed and coordinated among the receivers.

In brief, I only see positive effects of early gap detection and sending of 
NACKs, as long as we can make an intelligent rate limiting of the actual 
retransmissions.

[Ying] Thanks for the clear explanation, and it makes sense.

> Moreover, I have a more aggressive idea: now that NACK is delivered 
> through unicast, why not use unicast to retransmit bcast messages. 
> This is because what packets missed rarely happens in practice.

I tested this by running the "bcast-blast" program between two peers in a 
200-node network, and I can assure there were plenty of missed packets. 
Moreover, it seems to me that when a packet is lost, it is most often because 
it is dropped by an overwhelmed bridge, so that many, if not all, recipient 
will miss the same packet. So I don't think unicast is the solution.

[Ying] Yes, for packet loss, it's much more efficient to retransmit bcast 
messages through bcast than unicast. But compared to packet loss, packet 
disorder seems to more easily happen. If we cannot well restrain the rate of 
retransmission as packets are out of 

Re: [tipc-discussion] [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

2016-08-29 Thread Jon Maloy


On 08/25/2016 05:41 AM, Xue, Ying wrote:
>
> -Original Message-
> From: Jon Maloy [mailto:jon.ma...@ericsson.com]
> Sent: Wednesday, August 17, 2016 2:09 AM
> To: tipc-discussion@lists.sourceforge.net; 
> parthasarathy.bhuvara...@ericsson.com; Xue, Ying; richard.a...@ericsson.com; 
> jon.ma...@ericsson.com
> Cc: ma...@donjonn.com; gbala...@gmail.com
> Subject: [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions
>
> As cluster sizes grow, so does the amount of identical or overlapping 
> broadcast NACKs generated by the packet receivers. This often leads to 'NACK 
> crunches' resulting in huge numbers of redundant retransmissions of the same 
> packet ranges.
>
> In this commit, we introduce rate control of broadcast retransmissions, so 
> that a retransmitted range cannot be retransmitted again until after at least 
> 10 ms. This reduces the frequency of duplicate retransmissions by an order of 
> magnitude, while having a significant positive impact on throughput and 
> scalability.
>
> Signed-off-by: Jon Maloy 
> ---
>   net/tipc/link.c | 52 +++-
>   1 file changed, 47 insertions(+), 5 deletions(-)
>
> diff --git a/net/tipc/link.c b/net/tipc/link.c index 136316f..58bb44d 100644
> --- a/net/tipc/link.c
> +++ b/net/tipc/link.c
> @@ -181,7 +181,10 @@ struct tipc_link {
>   u16 acked;
>   struct tipc_link *bc_rcvlink;
>   struct tipc_link *bc_sndlink;
> - int nack_state;
> + unsigned long prev_retr;
> + u16 prev_from;
> + u16 prev_to;
> + u8 nack_state;
>   bool bc_peer_is_up;
>   
>   /* Statistics */
> @@ -202,6 +205,8 @@ enum {
>   BC_NACK_SND_SUPPRESS,
>   };
>   
> +#define TIPC_BC_RETR_LIMIT 10   /* [ms] */
>
> [Ying] I suggest we should define jiffies number rather than 10ms. If so, we 
> don't need to convert l->prev_retr  from jiffies to microsecond.

jiffies is not a uniquely defined entity, as it depends on the clock 
speed of the actual CPU.
I measured 10 ms to be a value that works better than e.g. 2 ms or 50 
ms, so I don't think it is a good idea to add any more randomness to this.

>
> +
>   /*
>* Interval between NACKs when packets arrive out of order
>*/
> @@ -1590,11 +1595,48 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, 
> struct tipc_msg *hdr)
>   l->rcv_nxt = peers_snd_nxt;
>   }
>   
> +/* link_bc_retr eval()- check if the indicated range can be
> +retransmitted now
> + * - Adjust permitted range if there is overlap with previous
> +retransmission  */ static bool link_bc_retr_eval(struct tipc_link *l,
> +u16 *from, u16 *to) {
> + unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
> +
> + if (less(*to, *from))
> + return false;
> +
> + /* New retransmission request */
> + if ((elapsed > TIPC_BC_RETR_LIMIT) ||
> + less(*to, l->prev_from) || more(*from, l->prev_to)) {
> + l->prev_from = *from;
> + l->prev_to = *to;
> + l->prev_retr = jiffies;
> + return true;
> + }
> +
>
> [Ying] In the my understanding, the statement above seems to be changed as 
> below:
>
> if ((elapsed > TIPC_BC_RETR_LIMIT) && (less(*to, l->prev_from) || more(*from, 
> l->prev_to)))
>
> Otherwise, I think the rate of retransmission may be still very high 
> especially when packet disorder frequently happens.
> As it's very normal that BCast retransmission range requested from different 
> nodes is not same, I guess the rate is not well suppressed in above condition.
>
> Maybe, we can use the method which is being effective in tipc_sk_enqueue() to 
> limit the rate like:

The condition indicates that the two retransmission requests are 
completely disjoint, e.g., 10-12 and 13-15. I don't see any reason why 
we should wait with retransmitting 13-15 in this case, as it is obvious 
that somebody is missing it, and we haven't retransmitted it before.
But I do understand it can be problematic if re receive repeated 
disjoint nacks, e.g., 10-12, 13-15, 10-12, 13-15 etc. I will make a test 
and see if there is anything to gain from keeping a "history" of the 
retransmissions, so that we never repeat the same packets inside the 
same interval. I'll take a look.

///jon

>
> tipc_sk_enqueue()
> {
> ..
>  while (skb_queue_len(inputq)) {
>  if (unlikely(time_after_eq(jiffies, time_limit)))
>  return;
> ..
> }
>
> And we don’t need to consider the retransmission range.
>
> Regards,
> Ying
>
> + /* Inside range of previous retransmit */
> + if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
> + return false;
> +
> + /* Fully or partially outside previous range => exclude overlap */
> + if (less(*from, l->prev_from)) {
> + *to = l->prev_from - 1;
> + l->prev_from = *from;
> + }
> + if (more(*to, l->prev_to)) {
> + *from = l->prev_to + 1;
> +  

Re: [tipc-discussion] [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

2016-08-25 Thread Xue, Ying


-Original Message-
From: Jon Maloy [mailto:jon.ma...@ericsson.com] 
Sent: Wednesday, August 17, 2016 2:09 AM
To: tipc-discussion@lists.sourceforge.net; 
parthasarathy.bhuvara...@ericsson.com; Xue, Ying; richard.a...@ericsson.com; 
jon.ma...@ericsson.com
Cc: ma...@donjonn.com; gbala...@gmail.com
Subject: [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

As cluster sizes grow, so does the amount of identical or overlapping broadcast 
NACKs generated by the packet receivers. This often leads to 'NACK crunches' 
resulting in huge numbers of redundant retransmissions of the same packet 
ranges.

In this commit, we introduce rate control of broadcast retransmissions, so that 
a retransmitted range cannot be retransmitted again until after at least 10 ms. 
This reduces the frequency of duplicate retransmissions by an order of 
magnitude, while having a significant positive impact on throughput and 
scalability.

Signed-off-by: Jon Maloy 
---
 net/tipc/link.c | 52 +++-
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c index 136316f..58bb44d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -181,7 +181,10 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
-   int nack_state;
+   unsigned long prev_retr;
+   u16 prev_from;
+   u16 prev_to;
+   u8 nack_state;
bool bc_peer_is_up;
 
/* Statistics */
@@ -202,6 +205,8 @@ enum {
BC_NACK_SND_SUPPRESS,
 };
 
+#define TIPC_BC_RETR_LIMIT 10   /* [ms] */

[Ying] I suggest we should define jiffies number rather than 10ms. If so, we 
don't need to convert l->prev_retr  from jiffies to microsecond.

+
 /*
  * Interval between NACKs when packets arrive out of order
  */
@@ -1590,11 +1595,48 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct 
tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
 }
 
+/* link_bc_retr eval()- check if the indicated range can be 
+retransmitted now
+ * - Adjust permitted range if there is overlap with previous 
+retransmission  */ static bool link_bc_retr_eval(struct tipc_link *l, 
+u16 *from, u16 *to) {
+   unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
+
+   if (less(*to, *from))
+   return false;
+
+   /* New retransmission request */
+   if ((elapsed > TIPC_BC_RETR_LIMIT) ||
+   less(*to, l->prev_from) || more(*from, l->prev_to)) {
+   l->prev_from = *from;
+   l->prev_to = *to;
+   l->prev_retr = jiffies;
+   return true;
+   }
+

[Ying] In the my understanding, the statement above seems to be changed as 
below:

if ((elapsed > TIPC_BC_RETR_LIMIT) && (less(*to, l->prev_from) || more(*from, 
l->prev_to)))

Otherwise, I think the rate of retransmission may be still very high especially 
when packet disorder frequently happens.
As it's very normal that BCast retransmission range requested from different 
nodes is not same, I guess the rate is not well suppressed in above condition.

Maybe, we can use the method which is being effective in tipc_sk_enqueue() to 
limit the rate like:

tipc_sk_enqueue()
{
..
while (skb_queue_len(inputq)) {
if (unlikely(time_after_eq(jiffies, time_limit)))
return;
..
}

And we don’t need to consider the retransmission range.

Regards,
Ying

+   /* Inside range of previous retransmit */
+   if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
+   return false;
+
+   /* Fully or partially outside previous range => exclude overlap */
+   if (less(*from, l->prev_from)) {
+   *to = l->prev_from - 1;
+   l->prev_from = *from;
+   }
+   if (more(*to, l->prev_to)) {
+   *from = l->prev_to + 1;
+   l->prev_to = *to;
+   }
+   l->prev_retr = jiffies;
+   return true;
+}
+
 /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
  */
 int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
  struct sk_buff_head *xmitq)
 {
+   struct tipc_link *snd_l = l->bc_sndlink;
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
u16 from = msg_bcast_ack(hdr) + 1;
u16 to = from + msg_bc_gap(hdr) - 1;
@@ -1613,14 +1655,14 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct 
tipc_msg *hdr,
if (!l->bc_peer_is_up)
return rc;
 
+   l->stats.recv_nacks++;
+
/* Ignore if peers_snd_nxt goes beyond receive window */
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
 
-   if (!less(to, from)) {
-   rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
-   l->stats.recv_nacks++;
-   }
+   if (link_bc_retr_eval(snd_l, , ))
+   rc = 

[tipc-discussion] [PATCH net-next 2/3] tipc: rate limit broadcast retransmissions

2016-08-16 Thread Jon Maloy
As cluster sizes grow, so does the amount of identical or overlapping
broadcast NACKs generated by the packet receivers. This often leads to
'NACK crunches' resulting in huge numbers of redundant retransmissions
of the same packet ranges.

In this commit, we introduce rate control of broadcast retransmissions,
so that a retransmitted range cannot be retransmitted again until after
at least 10 ms. This reduces the frequency of duplicate retransmissions
by an order of magnitude, while having a significant positive impact on
throughput and scalability.

Signed-off-by: Jon Maloy 
---
 net/tipc/link.c | 52 +++-
 1 file changed, 47 insertions(+), 5 deletions(-)

diff --git a/net/tipc/link.c b/net/tipc/link.c
index 136316f..58bb44d 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -181,7 +181,10 @@ struct tipc_link {
u16 acked;
struct tipc_link *bc_rcvlink;
struct tipc_link *bc_sndlink;
-   int nack_state;
+   unsigned long prev_retr;
+   u16 prev_from;
+   u16 prev_to;
+   u8 nack_state;
bool bc_peer_is_up;
 
/* Statistics */
@@ -202,6 +205,8 @@ enum {
BC_NACK_SND_SUPPRESS,
 };
 
+#define TIPC_BC_RETR_LIMIT 10   /* [ms] */
+
 /*
  * Interval between NACKs when packets arrive out of order
  */
@@ -1590,11 +1595,48 @@ void tipc_link_bc_init_rcv(struct tipc_link *l, struct 
tipc_msg *hdr)
l->rcv_nxt = peers_snd_nxt;
 }
 
+/* link_bc_retr eval()- check if the indicated range can be retransmitted now
+ * - Adjust permitted range if there is overlap with previous retransmission
+ */
+static bool link_bc_retr_eval(struct tipc_link *l, u16 *from, u16 *to)
+{
+   unsigned long elapsed = jiffies_to_msecs(jiffies - l->prev_retr);
+
+   if (less(*to, *from))
+   return false;
+
+   /* New retransmission request */
+   if ((elapsed > TIPC_BC_RETR_LIMIT) ||
+   less(*to, l->prev_from) || more(*from, l->prev_to)) {
+   l->prev_from = *from;
+   l->prev_to = *to;
+   l->prev_retr = jiffies;
+   return true;
+   }
+
+   /* Inside range of previous retransmit */
+   if (!less(*from, l->prev_from) && !more(*to, l->prev_to))
+   return false;
+
+   /* Fully or partially outside previous range => exclude overlap */
+   if (less(*from, l->prev_from)) {
+   *to = l->prev_from - 1;
+   l->prev_from = *from;
+   }
+   if (more(*to, l->prev_to)) {
+   *from = l->prev_to + 1;
+   l->prev_to = *to;
+   }
+   l->prev_retr = jiffies;
+   return true;
+}
+
 /* tipc_link_bc_sync_rcv - update rcv link according to peer's send state
  */
 int tipc_link_bc_sync_rcv(struct tipc_link *l, struct tipc_msg *hdr,
  struct sk_buff_head *xmitq)
 {
+   struct tipc_link *snd_l = l->bc_sndlink;
u16 peers_snd_nxt = msg_bc_snd_nxt(hdr);
u16 from = msg_bcast_ack(hdr) + 1;
u16 to = from + msg_bc_gap(hdr) - 1;
@@ -1613,14 +1655,14 @@ int tipc_link_bc_sync_rcv(struct tipc_link *l, struct 
tipc_msg *hdr,
if (!l->bc_peer_is_up)
return rc;
 
+   l->stats.recv_nacks++;
+
/* Ignore if peers_snd_nxt goes beyond receive window */
if (more(peers_snd_nxt, l->rcv_nxt + l->window))
return rc;
 
-   if (!less(to, from)) {
-   rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
-   l->stats.recv_nacks++;
-   }
+   if (link_bc_retr_eval(snd_l, , ))
+   rc = tipc_link_retrans(snd_l, from, to, xmitq);
 
l->snd_nxt = peers_snd_nxt;
if (link_bc_rcv_gap(l))
-- 
2.7.4


--
___
tipc-discussion mailing list
tipc-discussion@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/tipc-discussion