On 7/19/19 11:56 AM, Tuong Lien wrote:
> This commit eliminates the use of the link 'stale_limit' & 'prev_from'
> (besides the already removed - 'stale_cnt') variables in the detection
> of repeated retransmit failures as there is no proper way to initialize
> them to avoid a false detection, i.e. it is not really a retransmission
> failure but due to a garbage values in the variables.
Sorry, I couldn't understand the reason why we have no proper way to
initialize 'stale_limit' & 'prev_from' variables of tipc_link struct.
As far as I understand, the two variables have been set to zero when
tipc_link object is allocated with kzalloc() in tipc_link_create().
Can you please help me understand what real scenario we cannot properly
set them.
>
> Instead, a jiffies variable will be added to individual skbs (like the
> way we restrict the skb retransmissions) in order to mark the first skb
> retransmit time. Later on, at the next retransmissions, the timestamp
> will be checked to see if the skb in the link transmq is "too stale",
> that is, the link tolerance time has passed, so that a link reset will
> be ordered. Note, just checking on the first skb in the queue is fine
> enough since it must be the oldest one.
> A counter is also added to keep track the actual skb retransmissions'
> number for later checking when the failure happens.
>
> The downside of this approach is that the skb->cb[] buffer is about to
> be exhausted, however it is always able to allocate another memory area
> and keep a reference to it when needed.
>
> Fixes: 77cf8edbc0e7 ("tipc: simplify stale link failure criteria")
> Reported-by: Hoang Le <[email protected]>
> Signed-off-by: Tuong Lien <[email protected]>
> ---
> net/tipc/link.c | 92
> ++++++++++++++++++++++++++++++++-------------------------
> net/tipc/msg.h | 8 +++--
> 2 files changed, 57 insertions(+), 43 deletions(-)
>
> diff --git a/net/tipc/link.c b/net/tipc/link.c
> index 66d3a07bc571..c2c5c53cad22 100644
> --- a/net/tipc/link.c
> +++ b/net/tipc/link.c
> @@ -106,8 +106,6 @@ struct tipc_stats {
> * @transmitq: queue for sent, non-acked messages
> * @backlogq: queue for messages waiting to be sent
> * @snt_nxt: next sequence number to use for outbound messages
> - * @prev_from: sequence number of most previous retransmission request
> - * @stale_limit: time when repeated identical retransmits must force link
> reset
> * @ackers: # of peers that needs to ack each packet before it can be
> released
> * @acked: # last packet acked by a certain peer. Used for broadcast.
> * @rcv_nxt: next sequence number to expect for inbound messages
> @@ -164,9 +162,7 @@ struct tipc_link {
> u16 limit;
> } backlog[5];
> u16 snd_nxt;
> - u16 prev_from;
> u16 window;
> - unsigned long stale_limit;
>
> /* Reception */
> u16 rcv_nxt;
> @@ -1044,47 +1040,53 @@ static void tipc_link_advance_backlog(struct
> tipc_link *l,
> * link_retransmit_failure() - Detect repeated retransmit failures
> * @l: tipc link sender
> * @r: tipc link receiver (= l in case of unicast)
> - * @from: seqno of the 1st packet in retransmit request
> * @rc: returned code
> *
> * Return: true if the repeated retransmit failures happens, otherwise
> * false
> */
> static bool link_retransmit_failure(struct tipc_link *l, struct tipc_link *r,
> - u16 from, int *rc)
> + int *rc)
> {
> struct sk_buff *skb = skb_peek(&l->transmq);
> struct tipc_msg *hdr;
>
> if (!skb)
> return false;
> - hdr = buf_msg(skb);
>
> - /* Detect repeated retransmit failures on same packet */
> - if (r->prev_from != from) {
> - r->prev_from = from;
> - r->stale_limit = jiffies + msecs_to_jiffies(r->tolerance);
> - } else if (time_after(jiffies, r->stale_limit)) {
> - pr_warn("Retransmission failure on link <%s>\n", l->name);
> - link_print(l, "State of link ");
> - pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
> - msg_user(hdr), msg_type(hdr), msg_size(hdr),
> - msg_errcode(hdr));
> - pr_info("sqno %u, prev: %x, src: %x\n",
> - msg_seqno(hdr), msg_prevnode(hdr), msg_orignode(hdr));
> -
> - trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
> - trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
> - trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
> + if (!TIPC_SKB_CB(skb)->retr_cnt)
> + return false;
>
> - if (link_is_bc_sndlink(l))
> - *rc = TIPC_LINK_DOWN_EVT;
> + if (!time_after(jiffies, TIPC_SKB_CB(skb)->retr_stamp +
> + msecs_to_jiffies(r->tolerance)))
> + return false;
> +
> + hdr = buf_msg(skb);
> + if (link_is_bc_sndlink(l) && !less(r->acked, msg_seqno(hdr)))
> + return false;
>
> + pr_warn("Retransmission failure on link <%s>\n", l->name);
> + link_print(l, "State of link ");
> + pr_info("Failed msg: usr %u, typ %u, len %u, err %u\n",
> + msg_user(hdr), msg_type(hdr), msg_size(hdr), msg_errcode(hdr));
> + pr_info("sqno %u, prev: %x, dest: %x\n",
> + msg_seqno(hdr), msg_prevnode(hdr), msg_destnode(hdr));
> + pr_info("retr_stamp %d, retr_cnt %d\n",
> + jiffies_to_msecs(TIPC_SKB_CB(skb)->retr_stamp),
> + TIPC_SKB_CB(skb)->retr_cnt);
> +
> + trace_tipc_list_dump(&l->transmq, true, "retrans failure!");
> + trace_tipc_link_dump(l, TIPC_DUMP_NONE, "retrans failure!");
> + trace_tipc_link_dump(r, TIPC_DUMP_NONE, "retrans failure!");
> +
> + if (link_is_bc_sndlink(l)) {
> + r->state = LINK_RESET;
> + *rc = TIPC_LINK_DOWN_EVT;
> + } else {
> *rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
> - return true;
> }
>
> - return false;
> + return true;
> }
>
> /* tipc_link_bc_retrans() - retransmit zero or more packets
> @@ -1110,7 +1112,7 @@ static int tipc_link_bc_retrans(struct tipc_link *l,
> struct tipc_link *r,
>
> trace_tipc_link_retrans(r, from, to, &l->transmq);
>
> - if (link_retransmit_failure(l, r, from, &rc))
> + if (link_retransmit_failure(l, r, &rc))
> return rc;
>
> skb_queue_walk(&l->transmq, skb) {
> @@ -1119,11 +1121,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l,
> struct tipc_link *r,
> continue;
> if (more(msg_seqno(hdr), to))
> break;
> - if (link_is_bc_sndlink(l)) {
> - if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
> - continue;
> - TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
> - }
> +
> + if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
> + continue;
> + TIPC_SKB_CB(skb)->nxt_retr = TIPC_BC_RETR_LIM;
> _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE, GFP_ATOMIC);
> if (!_skb)
> return 0;
> @@ -1133,6 +1134,10 @@ static int tipc_link_bc_retrans(struct tipc_link *l,
> struct tipc_link *r,
> _skb->priority = TC_PRIO_CONTROL;
> __skb_queue_tail(xmitq, _skb);
> l->stats.retransmitted++;
> +
> + /* Increase actual retrans counter & mark first time */
> + if (!TIPC_SKB_CB(skb)->retr_cnt++)
> + TIPC_SKB_CB(skb)->retr_stamp = jiffies;
> }
> return 0;
> }
> @@ -1357,12 +1362,10 @@ static int tipc_link_advance_transmq(struct tipc_link
> *l, u16 acked, u16 gap,
> struct tipc_msg *hdr;
> u16 bc_ack = l->bc_rcvlink->rcv_nxt - 1;
> u16 ack = l->rcv_nxt - 1;
> + bool passed = false;
> u16 seqno, n = 0;
> int rc = 0;
>
> - if (gap && link_retransmit_failure(l, l, acked + 1, &rc))
> - return rc;
> -
I am also unable to understand why we would like to move
link_retransmit_failure() to internal loop of skb_queue_walk_safe() by
adding additional "passed" checking.
Thanks,
Ying
> skb_queue_walk_safe(&l->transmq, skb, tmp) {
> seqno = buf_seqno(skb);
>
> @@ -1372,12 +1375,17 @@ static int tipc_link_advance_transmq(struct tipc_link
> *l, u16 acked, u16 gap,
> __skb_unlink(skb, &l->transmq);
> kfree_skb(skb);
> } else if (less_eq(seqno, acked + gap)) {
> - /* retransmit skb */
> + /* First, check if repeated retrans failures occurs? */
> + if (!passed && link_retransmit_failure(l, l, &rc))
> + return rc;
> + passed = true;
> +
> + /* retransmit skb if unrestricted*/
> if (time_before(jiffies, TIPC_SKB_CB(skb)->nxt_retr))
> continue;
> TIPC_SKB_CB(skb)->nxt_retr = TIPC_UC_RETR_TIME;
> -
> - _skb = __pskb_copy(skb, MIN_H_SIZE, GFP_ATOMIC);
> + _skb = __pskb_copy(skb, LL_MAX_HEADER + MIN_H_SIZE,
> + GFP_ATOMIC);
> if (!_skb)
> continue;
> hdr = buf_msg(_skb);
> @@ -1386,6 +1394,10 @@ static int tipc_link_advance_transmq(struct tipc_link
> *l, u16 acked, u16 gap,
> _skb->priority = TC_PRIO_CONTROL;
> __skb_queue_tail(xmitq, _skb);
> l->stats.retransmitted++;
> +
> + /* Increase actual retrans counter & mark first time */
> + if (!TIPC_SKB_CB(skb)->retr_cnt++)
> + TIPC_SKB_CB(skb)->retr_stamp = jiffies;
> } else {
> /* retry with Gap ACK blocks if any */
> if (!ga || n >= ga->gack_cnt)
> @@ -2577,7 +2589,7 @@ int tipc_link_dump(struct tipc_link *l, u16 dqueues,
> char *buf)
> i += scnprintf(buf + i, sz - i, " %x", l->peer_caps);
> i += scnprintf(buf + i, sz - i, " %u", l->silent_intv_cnt);
> i += scnprintf(buf + i, sz - i, " %u", l->rst_cnt);
> - i += scnprintf(buf + i, sz - i, " %u", l->prev_from);
> + i += scnprintf(buf + i, sz - i, " %u", 0);
> i += scnprintf(buf + i, sz - i, " %u", 0);
> i += scnprintf(buf + i, sz - i, " %u", l->acked);
>
> diff --git a/net/tipc/msg.h b/net/tipc/msg.h
> index da509f0eb9ca..d7ebc9e955f6 100644
> --- a/net/tipc/msg.h
> +++ b/net/tipc/msg.h
> @@ -102,13 +102,15 @@ struct plist;
> #define TIPC_MEDIA_INFO_OFFSET 5
>
> struct tipc_skb_cb {
> - u32 bytes_read;
> - u32 orig_member;
> struct sk_buff *tail;
> unsigned long nxt_retr;
> - bool validated;
> + unsigned long retr_stamp;
> + u32 bytes_read;
> + u32 orig_member;
> u16 chain_imp;
> u16 ackers;
> + u16 retr_cnt;
> + bool validated;
> };
>
> #define TIPC_SKB_CB(__skb) ((struct tipc_skb_cb *)&((__skb)->cb[0]))
>
_______________________________________________
tipc-discussion mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/tipc-discussion