[RFC PATCH net-2.6.24 0/5]: TCP sacktag cache usage recoded
Hi all, After a couple of wrong-wayed before/after()s and one infinite loopy version, here's the current trial version of a sacktag cache usage recode Two first patches come from tcp-2.6 (rebased and rotated). This series applies cleanly only on top of the other three patch series I posted earlier today. The last debug patch provides some statistics for those interested enough. Dave, please DO NOT apply! ...Some thoughts could be nice though :-). It should improve processing of such likely events as cumulative ACKs and new forward holed SACK considerably because full walk is not necessary anymore (old code could have been tweaked to cover them but it's better to drop each special case handling altogether and do a generic solution). Redundancy of fastpath hints and highest_sack stuff is also addressed, however, it might have slight effect as the hint could point to something less than highest_sack occasionally; whether that's significant remains to be seen... In all cases except hint below highest_sack, the new solution should perform at least as well as the old code (with a bit larger constant though, no additional cache misses though) because the SACK blocks the old code chose not to process should all fall to LINUX_MIB_TCP_FULLSKIP category. It's possible to improve it easily with RB-tree stuff though this version is based on code using linked lists. I'm not yet too sure that I got everything 100% correct as I tweak start/end_seqs and exit skb loops in a way that is prone to off-by-one errors, could miss skb here and there. I'll probably also recode dsack handling too to avoid recursion. Stephen, Sangtae, others experiencing those unexpected RTOs during recovery of large windowed TCP, could you please give it a try to see if it helps any... -- i. ps. Our net-2.6.24 (and mainline?) DSACK processing code could be broken btw. DSACK in the middle of other SACK block during in-order walk seems to not be processed at all as the earlier (sorted) block caused skb to advance past it already? 
(Just occurred to me, I'll see what I can do to that if I've enough time). - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TCP]: Create tcp_sacktag_state.
From: David S. Miller [EMAIL PROTECTED] It is difficult to break out the inner-logic of tcp_sacktag_write_queue() into worker functions because so many local variables get updated in-place. Start to overcome this by creating a structure block of state variables that can be passed around into worker routines. [I made minor tweaks due to rebase/reordering of stuff in tcp-2.6 tree, and dropped found_dup_sack dup_sack from state -ij] Signed-off-by: David S. Miller [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 89 ++--- 1 files changed, 47 insertions(+), 42 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 259f517..04ff465 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1102,6 +1102,13 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, return !before(start_seq, end_seq - tp-max_window); } +struct tcp_sacktag_state { + unsigned int flag; + int reord; + int prior_fackets; + u32 lost_retrans; + int first_sack_index; +}; static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, @@ -1146,25 +1153,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)3; - int reord = tp-packets_out; - int prior_fackets; - u32 lost_retrans = 0; - int flag = 0; - int found_dup_sack = 0; + struct tcp_sacktag_state state; + int found_dup_sack; int cached_fack_count; int i; - int first_sack_index; + int force_one_sack; + + state.flag = 0; if (!tp-sacked_out) { tp-fackets_out = 0; tp-highest_sack = tp-snd_una; } - prior_fackets = tp-fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, -num_sacks, prior_snd_una); + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, num_sacks, prior_snd_una); if (found_dup_sack) - flag |= FLAG_DSACKING_ACK; + 
state.flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1177,18 +1181,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * if the only SACK change is the increase of the end_seq of * the first block then only apply that SACK block * and use retrans queue hinting otherwise slowpath */ - flag = 1; + force_one_sack = 1; for (i = 0; i num_sacks; i++) { __be32 start_seq = sp[i].start_seq; __be32 end_seq = sp[i].end_seq; if (i == 0) { if (tp-recv_sack_cache[i].start_seq != start_seq) - flag = 0; + force_one_sack = 0; } else { if ((tp-recv_sack_cache[i].start_seq != start_seq) || (tp-recv_sack_cache[i].end_seq != end_seq)) - flag = 0; + force_one_sack = 0; } tp-recv_sack_cache[i].start_seq = start_seq; tp-recv_sack_cache[i].end_seq = end_seq; @@ -1199,8 +1203,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp-recv_sack_cache[i].end_seq = 0; } - first_sack_index = 0; - if (flag) + state.first_sack_index = 0; + if (force_one_sack) num_sacks = 1; else { int j; @@ -1218,17 +1222,14 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ sp[j+1] = tmp; /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; + if (j == state.first_sack_index) + state.first_sack_index = j+1; } } } } - /* clear flag as used for different purpose in following code */ - flag = 0; - /* Use SACK fastpath hint if valid */ cached_skb = tp-fastpath_skb_hint; cached_fack_count = tp-fastpath_cnt_hint; @@ -1237,12 +1238,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_fack_count = 0; } + state.reord = tp
[PATCH 2/5] [TCP]: Create tcp_sacktag_one().
From: David S. Miller [EMAIL PROTECTED] Worker function that implements the main logic of the inner-most loop of tcp_sacktag_write_queue(). Signed-off-by: David S. Miller [EMAIL PROTECTED] Acked-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 213 ++ 1 files changed, 110 insertions(+), 103 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 04ff465..76e9c9b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1143,6 +1143,114 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, return dup_sack; } +static void tcp_sacktag_one(struct sk_buff *skb, struct tcp_sock *tp, + struct tcp_sacktag_state *state, int in_sack, + int dup_sack, int fack_count, u32 end_seq) +{ + u8 sacked = TCP_SKB_CB(skb)-sacked; + + /* Account D-SACK for retransmitted packet. */ + if ((dup_sack in_sack) + (sacked TCPCB_RETRANS) + after(TCP_SKB_CB(skb)-end_seq, tp-undo_marker)) + tp-undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)-end_seq, tp-snd_una)) { + if (sacked TCPCB_RETRANS) { + if ((dup_sack in_sack) + (sacked TCPCB_SACKED_ACKED)) + state-reord = min(fack_count, state-reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count state-prior_fackets + !(sacked TCPCB_SACKED_ACKED)) + state-reord = min(fack_count, state-reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + return; + } + + if ((sacked TCPCB_SACKED_RETRANS) + after(end_seq, TCP_SKB_CB(skb)-ack_seq) + (!state-lost_retrans || after(end_seq, state-lost_retrans))) + state-lost_retrans = end_seq; + + if (!in_sack) + return; + + if (!(sacked TCPCB_SACKED_ACKED)) { + if (sacked TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, +* we do not clear RETRANS, believing +* that retransmission is still in flight. 
+*/ + if (sacked TCPCB_LOST) { + TCP_SKB_CB(skb)-sacked = + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp-lost_out -= tcp_skb_pcount(skb); + tp-retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp-retransmit_skb_hint = NULL; + } + } else { + /* New sack for not retransmitted frame, +* which was in hole. It is reordering. +*/ + if (!(sacked TCPCB_RETRANS) + fack_count state-prior_fackets) + state-reord = min(fack_count, state-reord); + + if (sacked TCPCB_LOST) { + TCP_SKB_CB(skb)-sacked = ~TCPCB_LOST; + tp-lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp-retransmit_skb_hint = NULL; + } + /* SACK enhanced F-RTO detection. +* Set flag if and only if non-rexmitted +* segments below frto_highmark are +* SACKed (RFC4138; Appendix B). +* Clearing correct due to in-order walk +*/ + if (after(end_seq, tp-frto_highmark)) { + state-flag = ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked TCPCB_RETRANS)) + state-flag |= FLAG_ONLY_ORIG_SACKED; + } + } + + TCP_SKB_CB(skb)-sacked |= TCPCB_SACKED_ACKED; + state-flag |= FLAG_DATA_SACKED; + tp-sacked_out += tcp_skb_pcount(skb); + + if (fack_count tp-fackets_out) + tp-fackets_out = fack_count; + + if (after(TCP_SKB_CB(skb)-seq, + tp-highest_sack)) + tp-highest_sack = TCP_SKB_CB(skb)-seq; + } else { + if (dup_sack (sackedTCPCB_RETRANS)) + state-reord = min(fack_count, state-reord); + } + + /* D-SACK. We can detect redundant retransmission +* in S|R and plain R frames and clear
[RFC PATCH 4/5] [TCP]: Rewrite sack_recv_cache (WIP)
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Previously a number of cases in TCP SACK processing fail to take advantage of costly stored information in sack_recv_cache. Most importantly expected events such as cumulative ACK, new hole ACKs and first ACK after RTO fall to this category. Processing on such ACKs results in rather long walks building up latencies (which easily gets nasty when window is large), which are completely unnecessary, usually no new information was gathered except the new SACK block above the hole in the respective case. Since the inclusion of highest_sack, there's a lot of information that is very likely redundant (SACK fastpath hint stuff, fackets_out, highest_sack), though there's no ultimate guarantee that they'll remain the same the whole time (in all unearthly scenarios). Take advantage of this too and drop the fastpath hint. Effectively this drops the special-cased fastpath. This change adds some complexity to introduce a better-covered fastpath. The current ACK's SACK blocks are compared against each cached block individually and only ranges that are new are then scanned by the high constant walk. For other parts of the write queue, even when in a previously known part of the SACK blocks, a faster skip function is used. In addition, whenever possible, TCP fast-forwards to the highest_sack skb that was made available earlier. In the typical case, no other things but this fast-forward and mandatory markings after that occur, making the access pattern quite similar to the former fastpath. DSACKs are a special case that must always be walked. 
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |4 +- include/net/tcp.h |1 - net/ipv4/tcp_input.c | 320 ++-- net/ipv4/tcp_output.c | 12 +-- 4 files changed, 202 insertions(+), 135 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d6be2a..8d91eac 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -330,7 +330,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - struct tcp_sack_block_wire recv_sack_cache[4]; + struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* highest skb with SACK received * (validity guaranteed only if @@ -343,9 +343,7 @@ struct tcp_sock { struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; struct sk_buff *forward_skb_hint; - struct sk_buff *fastpath_skb_hint; - int fastpath_cnt_hint; int lost_cnt_hint; int retransmit_cnt_hint; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8bc64b7..d5def9b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1078,7 +1078,6 @@ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); - tp-fastpath_skb_hint = NULL; } /* MD5 Signature */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85dd4b0..9dfdd67 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1106,11 +1106,15 @@ struct tcp_sacktag_state { unsigned int flag; int reord; int prior_fackets; + int fack_count; u32 lost_retrans; - int first_sack_index; + u32 dup_start; + u32 dup_end; }; -static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct tcp_sock *tp, + struct tcp_sacktag_state *state, + struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { @@ -1120,6 +1124,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, 
struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)-ack_seq)) { dup_sack = 1; + state-dup_start = start_seq_0; + state-dup_end = end_seq_0; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1) { @@ -1129,6 +1135,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) !before(start_seq_0, start_seq_1)) { dup_sack = 1; + state-dup_start = start_seq_1; + state-dup_end = end_seq_1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } @@ -1251,6 +1259,104 @@ static void tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, } } +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk
[DEVELOPER PATCH 5/5] [TCP]: Track sacktag
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h | 20 +++ net/ipv4/proc.c | 20 +++ net/ipv4/tcp_input.c | 52 +++-- 3 files changed, 89 insertions(+), 3 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 89f0c2b..42b8c07 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -214,6 +214,26 @@ enum LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ LINUX_MIB_TCPDSACKIGNOREDNOUNDO,/* TCPSACKIgnoredNoUndo */ LINUX_MIB_TCPSPURIOUSRTOS, /* TCPSpuriousRTOs */ + LINUX_MIB_TCP_SACKTAG, + LINUX_MIB_TCP_SACK0, + LINUX_MIB_TCP_SACK1, + LINUX_MIB_TCP_SACK2, + LINUX_MIB_TCP_SACK3, + LINUX_MIB_TCP_SACK4, + LINUX_MIB_TCP_WALKEDSKBS, + LINUX_MIB_TCP_WALKEDDSACKS, + LINUX_MIB_TCP_SKIPPEDSKBS, + LINUX_MIB_TCP_NOCACHE, + LINUX_MIB_TCP_FULLWALK, + LINUX_MIB_TCP_HEADWALK, + LINUX_MIB_TCP_TAILWALK, + LINUX_MIB_TCP_FULLSKIP, + LINUX_MIB_TCP_TAILSKIP, + LINUX_MIB_TCP_HEADSKIP, + LINUX_MIB_TCP_FULLSKIP_TOHIGH, + LINUX_MIB_TCP_TAILSKIP_TOHIGH, + LINUX_MIB_TCP_NEWSKIP, + LINUX_MIB_TCP_CACHEREMAINING, __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9dee70e..4808f82 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -246,6 +246,26 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPDSACKIgnoredOld, LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM(TCPDSACKIgnoredNoUndo, LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM(TCPSpuriousRTOs, LINUX_MIB_TCPSPURIOUSRTOS), + SNMP_MIB_ITEM(TCP_SACKTAG, LINUX_MIB_TCP_SACKTAG), + SNMP_MIB_ITEM(TCP_SACK0, LINUX_MIB_TCP_SACK0), + SNMP_MIB_ITEM(TCP_SACK1, LINUX_MIB_TCP_SACK1), + SNMP_MIB_ITEM(TCP_SACK2, LINUX_MIB_TCP_SACK2), + SNMP_MIB_ITEM(TCP_SACK3, LINUX_MIB_TCP_SACK3), + SNMP_MIB_ITEM(TCP_SACK4, LINUX_MIB_TCP_SACK4), + SNMP_MIB_ITEM(TCP_WALKEDSKBS, LINUX_MIB_TCP_WALKEDSKBS), + SNMP_MIB_ITEM(TCP_WALKEDDSACKS, LINUX_MIB_TCP_WALKEDDSACKS), + SNMP_MIB_ITEM(TCP_SKIPPEDSKBS, LINUX_MIB_TCP_SKIPPEDSKBS), + 
SNMP_MIB_ITEM(TCP_NOCACHE, LINUX_MIB_TCP_NOCACHE), + SNMP_MIB_ITEM(TCP_FULLWALK, LINUX_MIB_TCP_FULLWALK), + SNMP_MIB_ITEM(TCP_HEADWALK, LINUX_MIB_TCP_HEADWALK), + SNMP_MIB_ITEM(TCP_TAILWALK, LINUX_MIB_TCP_TAILWALK), + SNMP_MIB_ITEM(TCP_FULLSKIP, LINUX_MIB_TCP_FULLSKIP), + SNMP_MIB_ITEM(TCP_TAILSKIP, LINUX_MIB_TCP_TAILSKIP), + SNMP_MIB_ITEM(TCP_HEADSKIP, LINUX_MIB_TCP_HEADSKIP), + SNMP_MIB_ITEM(TCP_FULLSKIP_TOHIGH, LINUX_MIB_TCP_FULLSKIP_TOHIGH), + SNMP_MIB_ITEM(TCP_TAILSKIP_TOHIGH, LINUX_MIB_TCP_TAILSKIP_TOHIGH), + SNMP_MIB_ITEM(TCP_NEWSKIP, LINUX_MIB_TCP_NEWSKIP), + SNMP_MIB_ITEM(TCP_CACHEREMAINING, LINUX_MIB_TCP_CACHEREMAINING), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9dfdd67..38045c8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1306,6 +1306,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_sacktag_one(skb, sk, state, in_sack, dup_sack, state-fack_count, end_seq); + + NET_INC_STATS_BH(LINUX_MIB_TCP_WALKEDSKBS); + if (dup_sack) + NET_INC_STATS_BH(LINUX_MIB_TCP_WALKEDDSACKS); } return skb; } @@ -1329,6 +1333,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, skb = tcp_sacktag_walk(skb, sk, state, state-dup_start, state-dup_end, 1); } + NET_INC_STATS_BH(LINUX_MIB_TCP_SKIPPEDSKBS); } return skb; } @@ -1458,9 +1463,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cache++; } + NET_INC_STATS_BH(LINUX_MIB_TCP_SACKTAG); + switch (used_sacks) { + case 0: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK0); break; + case 1: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK1); break; + case 2: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK2); break; + case 3: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK3); break; + case 4: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK4); break; + } + + if (!tcp_sack_cache_ok(tp, cache)) + NET_INC_STATS_BH(LINUX_MIB_TCP_NOCACHE); + while (i used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; + int fullwalk 
= 0; /* Event B in the comment above. */ if (after(end_seq, tp-high_seq)) @@ -1473,41 +1491,69
Re: [PATCH 1/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Patch 1: David Miller's red-black tree code, tweaked for 2.6.22.6, with some bugfixes It would help if you would leave the original changes as is (rb-tree and fack_count separated) and add your work on top of that... diff -ur linux-2.6.22.6/include/net/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h --- linux-2.6.22.6/include/net/tcp.h 2007-08-30 23:21:01.0 -0700 +++ linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.0 -0700 @@ -540,6 +540,7 @@ __u32 seq;/* Starting sequence number */ __u32 end_seq;/* SEQ + FIN + SYN + datalen*/ __u32 when; /* used to compute rtt's*/ + unsigned intfack_count; /* speed up SACK processing */ __u8flags; /* TCP header flags.*/ /* NOTE: These must match up to the flags byte in a @@ -1043,12 +1044,12 @@ } /*from STCP */ -static inline void clear_all_retrans_hints(struct tcp_sock *tp){ +static inline void clear_all_retrans_hints(struct tcp_sock *tp) +{ Unrelated change, please don't do that. Besides, it's already fixed in net-2.6.24. tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; tp-forward_skb_hint = NULL; - tp-fastpath_skb_hint = NULL; } /* MD5 Signature */ @@ -1227,9 +1229,61 @@ sk-sk_send_head = NULL; } +static inline struct sk_buff *tcp_write_queue_find(struct sock *sk, __u32 seq) +{ + struct rb_node *rb_node = tcp_sk(sk)-write_queue_rb.rb_node; + struct sk_buff *skb = NULL; + + while (rb_node) { + struct sk_buff *tmp = rb_entry(rb_node,struct sk_buff,rb); + if (TCP_SKB_CB(tmp)-end_seq seq) { This is old and buggy version of the rb-tree code. Get the latest rb-tree patch from tcp-2.6 tree. + skb = tmp; + if (TCP_SKB_CB(tmp)-seq = seq) ...fixed in tcp-2.6. 
+ break; + rb_node = rb_node-rb_left; + } else + rb_node = rb_node-rb_right; + + } + return skb; +} + +static inline void tcp_rb_insert(struct sk_buff *skb, struct rb_root *root) +{ + struct rb_node **rb_link, *rb_parent; + __u32 seq = TCP_SKB_CB(skb)-seq; + + rb_link = root-rb_node; + rb_parent = NULL; + while (*rb_link != NULL) { + struct sk_buff *tmp = rb_entry(*rb_link,struct sk_buff,rb); + rb_parent = *rb_link; + if (TCP_SKB_CB(tmp)-end_seq seq) { + BUG_ON(TCP_SKB_CB(tmp)-seq = seq); ...these are broken as well. + rb_link = rb_parent-rb_left; + } else { + rb_link = rb_parent-rb_right; + } + } + rb_link_node(skb-rb, rb_parent, rb_link); + rb_insert_color(skb-rb, root); +} + +static inline void tcp_rb_unlink(struct sk_buff *skb, struct rb_root *root) +{ + rb_erase(skb-rb, root); +} + static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { + struct sk_buff *tail = tcp_write_queue_tail(sk); + unsigned int fc = 0; + + if (tail) + fc = TCP_SKB_CB(tail)-fack_count + tcp_skb_pcount(tail); + TCP_SKB_CB(skb)-fack_count = fc; __skb_queue_tail(sk-sk_write_queue, skb); + tcp_rb_insert(skb, tcp_sk(sk)-write_queue_rb); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) diff -ur linux-2.6.22.6/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c --- linux-2.6.22.6/net/ipv4/tcp_input.c 2007-08-30 23:21:01.0 -0700 +++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c2007-09-13 18:23:16.0 -0700 @@ -947,14 +947,13 @@ unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)-sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)3; int reord = tp-packets_out; int prior_fackets; u32 lost_retrans = 0; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; + int fack_count_base; int i; int first_sack_index; @@ -1020,7 +1019,6 @@ num_sacks = 1; else { int j; - 
tp-fastpath_skb_hint = NULL; /* order SACK blocks to allow in order walk of the retrans queue */ for (i = num_sacks-1; i 0; i--) { @@ -1045,14 +1043,7 @@ /* clear flag as used for different purpose in following code */ flag = 0; - /* Use SACK fastpath hint if valid */ - cached_skb = tp-fastpath_skb_hint; -
Re: [PATCH 2/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Patch 2: fixes to fack_counts and enhancement of SACK fast path ...Usually these are not combined in patches but a separate patch per change. diff -ur linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h linux-2.6.22.6-rbtree-tomq/include/net/tcp.h --- linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.0 -0700 +++ linux-2.6.22.6-rbtree-tomq/include/net/tcp.h 2007-09-19 12:22:06.0 -0700 @@ -1213,6 +1213,11 @@ ...Btw, please always use -p to diff as well as it helps review :-). sk-sk_send_head = tcp_write_queue_next(sk, skb); if (sk-sk_send_head == (struct sk_buff *)sk-sk_write_queue) sk-sk_send_head = NULL; + else + /* update fack_count of send_head. Since we've sent skb already, + * its packet count must be set by now. */ + TCP_SKB_CB(sk-sk_send_head)-fack_count = + TCP_SKB_CB(skb)-fack_count + tcp_skb_pcount(skb); /* Don't override Nagle indefinately with F-RTO */ if (tp-frto_counter == 2) tp-frto_counter = 3; @@ -1310,19 +1315,22 @@ /* An insert into the middle of the write queue causes the fack * counts in subsequent packets to become invalid, fix them up. 
*/ -static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *first) +static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *prev = first-prev; + struct sk_buff *prev = skb-prev; unsigned int fc = 0; if (prev != (struct sk_buff *) sk-sk_write_queue) fc = TCP_SKB_CB(prev)-fack_count + tcp_skb_pcount(prev); - while (first != (struct sk_buff *)sk-sk_write_queue) { - TCP_SKB_CB(first)-fack_count = fc; + while (skb != (struct sk_buff *)sk-sk_write_queue) { + if (TCP_SKB_CB(skb)-fack_count == fc || !tcp_skb_pcount(skb)) What is this !tcp_skb_pcount(skb) for I think what we really want here is this:|| (skb == tcp_send_head(sk)) + break; - fc += tcp_skb_pcount(first); - first = first-next; + TCP_SKB_CB(skb)-fack_count = fc; + + fc += tcp_skb_pcount(skb); + skb = skb-next; } } diff -ur linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-tomq/net/ipv4/tcp_input.c --- linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c2007-09-13 18:23:16.0 -0700 +++ linux-2.6.22.6-rbtree-tomq/net/ipv4/tcp_input.c 2007-09-19 12:27:42.0 -0700 @@ -956,6 +956,7 @@ int fack_count_base; int i; int first_sack_index; + u32 prev_end_seq = 0; if (!tp-sacked_out) tp-fackets_out = 0; @@ -1000,6 +1001,7 @@ if (i == 0) { if (tp-recv_sack_cache[i].start_seq != start_seq) flag = 0; + prev_end_seq = ntohl(tp-recv_sack_cache[i].end_seq); } else { if ((tp-recv_sack_cache[i].start_seq != start_seq) || (tp-recv_sack_cache[i].end_seq != end_seq)) @@ -1016,9 +1018,16 @@ first_sack_index = 0; if (flag) + /* all that has changed is end of first SACK block. So all we + * need to do is tag those skbs that were'nt tagged last time. */ IMHO, a bit verbose comment. Besides, we already tell above that what SACK fastpath is all about... num_sacks = 1; else { int j; + + /* more than just end of first SACK block has changed; invalidate + * prev_end_seq */ + + prev_end_seq = 0; Don't tell what's obvious from code as well. 
/* order SACK blocks to allow in order walk of the retrans queue */ for (i = num_sacks-1; i 0; i--) { @@ -1051,6 +1060,8 @@ int fack_count; int dup_sack = (found_dup_sack (i == first_sack_index)); + if (prev_end_seq) start_seq = prev_end_seq; + ...We never add statements on the same line after if statements (see Documentation/CodingStyle) -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Here are a couple of patches against 2.6.22.6. The first one is just David's patches tweaked for 2.6.22.6, with a couple of minor bugfixes to get it to compile and not crash. Why did you combine the original patches into a single larger one, I think Dave made them separate on purpose. (I also changed __tcp_insert_write_queue_tail() to set the fack_count of the new packet to the fack_count of the tail plus the packet count of the tail, not the packet count of the new skb, because I think that's how it was intended to be. Right? I think I noticed a similar off-by-pcount error when I looked a long time ago, so I guess you're correct. We're only interested in the delta of it anyway and add the current skb's pcount to it (which is not fixed until tcp_fragment in sacktag is past). In the second patch there are a couple of significant changes. One is (as Baruch suggested) to modify the existing SACK fast path so that we don't tag packets we've already tagged when we advance by a packet. This solution would still spend an extensive amount of time in the processing loop whenever the recv_sack_cache fast-path is not taken, that is, e.g. when a cumulative ACK after retransmissions arrives or a new hole becomes visible (which are not very exceptional events after all :-)). In the cumulative ACK case especially, this processing is very likely _fully_ wasted walking. So there is still room for large improvements. I've made an improved version of the current sacktag walk a couple of days ago (it's in a state where it didn't crash but likely very buggy still), I'll post it here soon... Idea is to embed recv_sack_cache checking fully into the walking loop. By doing that, previously known work is not duplicated. 
The patch is currently against non-rbtree stuff but incorporating rb-tree things on top of it should be very trivial and synergy benefits with rbtree should be considerable because non-rbtree has to do fast walk skipping for skbs that are under highest_sack which is prone to cache misses. The other issue is that the cached fack_counts seem to be wrong, because they're set when we insert into the queue, but tcp_set_tso_segs() is called later, just before we send, so all the fack_counts are zero. My solution was to set the fack_count when we advance the send_head. I think it's better solution anyway, since we might have to do reset_fack_counts() in between and there's no need to update past sk_send_head. Also I changed tcp_reset_fack_counts() so that it exits when it hits an skb whose tcp_skb_pcount() is zero Do you mind to explain what's the purpose of that? or whose fack_count is already correct. (This really helps when TSO is on, since there's lots of inserting into the middle of the queue.) Good point. Please let me know how I can help get this tested and debugged. Most network development happens against latest net-2.6(.x) trees. In addition, there's experimental tcp-2.6 tree but it's currently a bit outdated already (and DaveM is very busy with the phenomenal merge they're doing for 2.6.24 :-) so it's not too likely to be updated very soon). ...Anyway, thanks for your interest in these things. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6.24 0/9]: TCP improvements cleanups
Hi Dave, Just in case you're short on what to do ;-) here are some TCP related cleanups & improvements to net-2.6.24. Including an FRTO undo fix which finally should allow FRTO to be turned on, and some simple fastpath tweaks simple enough for the 2.6.24 schedule. ...I've a larger fastpath_hint removal patch coming up later too but it's really a monster which needs more time, though I guess it could really cut down the SACK processing latencies people are experiencing with high-speed flows (I'll probably post it with RFC once you've picked these up). These were boot (& couple of hours) tested on top of net-2.6.24 (something after the first large rebase you did, so you could count that as a success report of it too :-)). Not sure if all those fragment/collapse paths I modified got executed though. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] [TCP]: Maintain highest_sack accurately to the highest skb
In general, it should not be necessary to call tcp_fragment for already SACKed skbs, but it's better to be safe than sorry. And indeed, it can be called from sacktag when a DSACK arrives or some ACK (with SACK) reordering occurs (sacktag could be made to avoid the call in the latter case though I'm not sure if it's worth of the trouble and added complexity to cover such marginal case). The collapse case has return for SACKED_ACKED case earlier, so just WARN_ON if internal inconsistency is detected for some reason. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d65d17b..9df5b2a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -692,6 +692,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)-end_seq = TCP_SKB_CB(skb)-end_seq; TCP_SKB_CB(skb)-end_seq = TCP_SKB_CB(buff)-seq; + if (tp-sacked_out (TCP_SKB_CB(skb)-seq == tp-highest_sack)) + tp-highest_sack = TCP_SKB_CB(buff)-seq; + /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)-flags; TCP_SKB_CB(skb)-flags = flags ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); @@ -1723,6 +1726,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Update sequence range on original skb. */ TCP_SKB_CB(skb)-end_seq = TCP_SKB_CB(next_skb)-end_seq; + if (WARN_ON(tp-sacked_out + (TCP_SKB_CB(next_skb)-seq == tp-highest_sack))) + return; + /* Merge over control information. */ flags |= TCP_SKB_CB(next_skb)-flags; /* This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)-flags = flags; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] [TCP]: Make fackets_out accurate
Substraction for fackets_out is unconditional when snd_una advances, thus there's no need to do it inside the loop. Just make sure correct bounds are honored. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 10 +++--- net/ipv4/tcp_output.c | 44 ++-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fd0ae4d..09b6b1d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2302,8 +2302,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp-packets_out) tp-sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp-sacked_out == 0) + + if (WARN_ON(!tp-sacked_out tp-fackets_out)) tp-fackets_out = 0; /* Now state machine starts. @@ -2571,10 +2571,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, } else if (*seq_rtt 0) *seq_rtt = now - scb-when; - if (tp-fackets_out) { - __u32 dval = min(tp-fackets_out, packets_acked); - tp-fackets_out -= dval; - } tp-packets_out -= packets_acked; BUG_ON(tcp_skb_pcount(skb) == 0); @@ -2657,7 +2653,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = now - scb-when; last_ackt = skb-tstamp; } - tcp_dec_pcount_approx(tp-fackets_out, skb); tp-packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); @@ -2672,6 +2667,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_rearm_rto(sk); + tp-fackets_out -= min(pkts_acked, tp-fackets_out); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9df5b2a..cbe8bf6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -652,6 +652,26 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned } } +/* When a modification to fackets out becomes 
necessary, we need to check + * skb is counted to fackets_out or not. Another important thing is to + * tweak SACK fastpath hint too as it would overwrite all changes unless + * hint is also changed. + */ +static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, + int decr) +{ + if (!tp-sacked_out) + return; + + if (!before(tp-highest_sack, TCP_SKB_CB(skb)-seq)) + tp-fackets_out -= decr; + + /* cnt_hint is off-by-one compared with fackets_out (see sacktag) */ + if (tp-fastpath_skb_hint != NULL + after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, TCP_SKB_CB(skb)-seq)) + tp-fastpath_cnt_hint -= decr; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -746,21 +766,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (TCP_SKB_CB(skb)-sacked TCPCB_LOST) tp-lost_out -= diff; - if (diff 0) { - /* Adjust Reno SACK estimate. */ - if (tcp_is_reno(tp)) { - tcp_dec_pcount_approx_int(tp-sacked_out, diff); - tcp_verify_left_out(tp); - } - - tcp_dec_pcount_approx_int(tp-fackets_out, diff); - /* SACK fastpath might overwrite it unless dealt with */ - if (tp-fastpath_skb_hint != NULL - after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, - TCP_SKB_CB(skb)-seq)) { - tcp_dec_pcount_approx_int(tp-fastpath_cnt_hint, diff); - } + /* Adjust Reno SACK estimate. */ + if (tcp_is_reno(tp) diff 0) { + tcp_dec_pcount_approx_int(tp-sacked_out, diff); + tcp_verify_left_out(tp); } + tcp_adjust_fackets_out(tp, skb, diff); } /* Link BUFF into the send queue. */ @@ -1746,10 +1757,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if (tcp_is_reno(tp) tp-sacked_out) tcp_dec_pcount_approx(tp-sacked_out, next_skb
[PATCH 3/9] [TCP]: clear_all_retrans_hints prefixed by tcp_
In addition, fix its function comment spacing. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |4 ++-- net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_output.c |6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f28f382..16dfe3c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1066,8 +1066,8 @@ static inline void tcp_mib_init(void) TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1); } -/*from STCP */ -static inline void clear_all_retrans_hints(struct tcp_sock *tp){ +/* from STCP */ +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09b6b1d..89162a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1670,7 +1670,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-high_seq = tp-frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1741,7 +1741,7 @@ void tcp_enter_loss(struct sock *sk, int how) /* Abort FRTO algorithm if one is in progress */ tp-frto_counter = 0; - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -2106,7 +2106,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -2199,7 +2199,7 @@ static int tcp_try_undo_loss(struct sock *sk) TCP_SKB_CB(skb)-sacked = ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, partial loss); tp-lost_out = 0; @@ -2656,7 +2656,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tp-packets_out -= 
tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } if (ackedFLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cbe8bf6..f46d24b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len skb-len); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize 0) nsize = 0; @@ -1719,7 +1719,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* changing transmit queue under us so clear hints */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1792,7 +1792,7 @@ void tcp_simple_retransmit(struct sock *sk) } } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); if (!lost) return; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] [TCP]: Move accounting from tso_acked to clean_rtx_queue
The accounting code is pretty much the same, so it's a shame we do it in two places. I'm not too sure if added fully_acked check in MTU probing is really what we want perhaps the added end_seq could be used in the after() comparison. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 75 + 1 files changed, 32 insertions(+), 43 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 89162a9..d340fd5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2528,14 +2528,12 @@ static void tcp_rearm_rto(struct sock *sk) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, -__u32 now, __s32 *seq_rtt) +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u32 seq = tp-snd_una; __u32 packets_acked; - int acked = 0; /* If we get here, the whole TSO packet has not been * acked. @@ -2548,36 +2546,11 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb-sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked TCPCB_RETRANS) { - if (sacked TCPCB_SACKED_RETRANS) - tp-retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt 0) - *seq_rtt = now - scb-when; - if (sacked TCPCB_SACKED_ACKED) - tp-sacked_out -= packets_acked; - if (sacked TCPCB_LOST) - tp-lost_out -= packets_acked; - if (sacked TCPCB_URG) { - if (tp-urg_mode - !before(seq, tp-snd_up)) - tp-urg_mode = 0; - } - } else if (*seq_rtt 0) - *seq_rtt = now - scb-when; - - tp-packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(scb-seq, scb-end_seq)); } - return acked; + return packets_acked; } /* Remove acknowledged frames from the retransmission queue. 
*/ @@ -2587,6 +2560,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; + int fully_acked = 1; int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; @@ -2595,6 +2569,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 end_seq; + u32 packets_acked; __u8 sacked = scb-sacked; /* If our packet is before the ack sequence we can @@ -2602,11 +2578,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * the other end. */ if (after(scb-end_seq, tp-snd_una)) { - if (tcp_skb_pcount(skb) 1 - after(tp-snd_una, scb-seq)) - acked |= tcp_tso_acked(sk, skb, - now, seq_rtt); - break; + if (tcp_skb_pcount(skb) == 1 || + !after(tp-snd_una, scb-seq)) + break; + + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp-snd_una; + } else { + packets_acked = tcp_skb_pcount(skb); + end_seq = scb-end_seq; } /* Initial outgoing SYN's get put onto the write_queue @@ -2624,7 +2608,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } /* MTU probing checks */ - if (icsk-icsk_mtup.probe_size) { + if (fully_acked icsk-icsk_mtup.probe_size) { if (!after(tp-mtu_probe.probe_seq_end, TCP_SKB_CB(skb)-end_seq)) { tcp_mtup_probe_success(sk, skb); } @@ -2633,27 +2617,32 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32
[PATCH 6/9] [TCP] FRTO: Improve interoperability with other undo_marker users
Basically this change enables it, previously other undo_marker users were left with nothing. Reverse undo_marker logic completely to get it set right in CA_Loss. On the other hand, when spurious RTO is detected, clear it. Clearing might be too heavy for some scenarios but seems safe enough starting point for now and shouldn't have much effect except in majority of cases (if in any). By adding a new FLAG_ we avoid looping through write_queue when RTO occurs. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 42 +++--- 1 files changed, 27 insertions(+), 15 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 74accb0..948e79a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -1597,6 +1598,8 @@ void tcp_enter_frto(struct sock *sk) tp-undo_retrans = 0; skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)-sacked TCPCB_RETRANS) + tp-undo_marker = 0; if (TCP_SKB_CB(skb)-sacked TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)-sacked = ~TCPCB_SACKED_RETRANS; tp-retrans_out -= tcp_skb_pcount(skb); @@ -1646,6 +1649,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)-sacked TCPCB_RETRANS) + tp-undo_marker = 0; TCP_SKB_CB(skb)-sacked = ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } @@ -1661,7 +1666,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-snd_cwnd = 
tcp_packets_in_flight(tp) + allowed_segments; tp-snd_cwnd_cnt = 0; tp-snd_cwnd_stamp = tcp_time_stamp; - tp-undo_marker = 0; tp-frto_counter = 0; tp-reordering = min_t(unsigned int, tp-reordering, @@ -2587,20 +2591,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) end_seq = scb-end_seq; } - /* Initial outgoing SYN's get put onto the write_queue -* just like anything else we transmit. It is not -* true data, and if we misinform our callers that -* this ACK acks real data, we will erroneously exit -* connection startup slow start one packet too -* quickly. This is severely frowned upon behavior. -*/ - if (!(scb-flags TCPCB_FLAG_SYN)) { - flag |= FLAG_DATA_ACKED; - } else { - flag |= FLAG_SYN_ACKED; - tp-retrans_stamp = 0; - } - /* MTU probing checks */ if (fully_acked icsk-icsk_mtup.probe_size !after(tp-mtu_probe.probe_seq_end, scb-end_seq)) { @@ -2613,6 +2603,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) tp-retrans_out -= packets_acked; flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag FLAG_DATA_ACKED) || + (packets_acked 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt 0) { seq_rtt = now - scb-when; if (fully_acked) @@ -2634,6 +2627,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) } tp-packets_out -= packets_acked; + /* Initial outgoing SYN's get put onto the write_queue +* just like anything else we transmit. It is not +* true data, and if we misinform our callers that +* this ACK acks real data, we will erroneously exit +* connection startup slow start one packet too +* quickly. This is severely frowned upon behavior. +*/ + if (!(scb-flags TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp-retrans_stamp = 0
[PATCH 7/9] [TCP] FRTO: Update sysctl documentation
Since the SACK enhanced FRTO was added, the code has been under test numerous times so remove experimental claim from the documentation. Also be a bit more verbose about the usage. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Documentation/networking/ip-sysctl.txt | 17 - 1 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 32c2e9d..6ae2fef 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -180,13 +180,20 @@ tcp_fin_timeout - INTEGER to live longer. Cf. tcp_max_orphans. tcp_frto - INTEGER - Enables F-RTO, an enhanced recovery algorithm for TCP retransmission + Enables Forward RTO-Recovery (F-RTO) defined in RFC4138. + F-RTO is an enhanced recovery algorithm for TCP retransmission timeouts. It is particularly beneficial in wireless environments where packet loss is typically due to random radio interference - rather than intermediate router congestion. If set to 1, basic - version is enabled. 2 enables SACK enhanced F-RTO, which is - EXPERIMENTAL. The basic version can be used also when SACK is - enabled for a flow through tcp_sack sysctl. + rather than intermediate router congestion. FRTO is sender-side + only modification. Therefore it does not require any support from + the peer, but in a typical case, however, where wireless link is + the local access link and most of the data flows downlink, the + faraway servers should have FRTO enabled to take advantage of it. + If set to 1, basic version is enabled. 2 enables SACK enhanced + F-RTO if flow uses SACK. The basic version can be used also when + SACK is in use though scenario(s) with it exists where FRTO + interacts badly with the packet counting of the SACK enabled TCP + flow. 
tcp_frto_response - INTEGER When F-RTO has detected that a TCP retransmission timeout was -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] [TCP]: Cleanup tcp_tso_acked and tcp_clean_rtx_queue
Implements following cleanups: - Comment re-placement (CodingStyle) - tcp_tso_acked() local (wrapper-like) variable removal (readability) - __-types removed (IMHO they make local variables jumpy looking and just was space) - acked - flag (naming conventions elsewhere in TCP code) - linebreak adjustments (readability) - nested if()s combined (reduced indentation) - clarifying newlines added Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 66 ++--- 1 files changed, 30 insertions(+), 36 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d340fd5..74accb0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2528,55 +2528,49 @@ static void tcp_rearm_rto(struct sock *sk) } } +/* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp-snd_una; - __u32 packets_acked; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been -* acked. -*/ - BUG_ON(!after(scb-end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)-end_seq, tp-snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb-seq)) + if (tcp_trim_head(sk, skb, tp-snd_una - TCP_SKB_CB(skb)-seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb-seq, scb-end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)-seq, TCP_SKB_CB(skb)-end_seq)); } return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. 
+ */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp; int fully_acked = 1; - int acked = 0; + int flag = 0; int prior_packets = tp-packets_out; - __s32 seq_rtt = -1; + s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - while ((skb = tcp_write_queue_head(sk)) - skb != tcp_send_head(sk)) { + while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; u32 packets_acked; - __u8 sacked = scb-sacked; + u8 sacked = scb-sacked; - /* If our packet is before the ack sequence we can -* discard it as it's confirmed to have arrived at -* the other end. -*/ if (after(scb-end_seq, tp-snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp-snd_una, scb-seq)) @@ -2601,38 +2595,38 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * quickly. This is severely frowned upon behavior. */ if (!(scb-flags TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; + flag |= FLAG_DATA_ACKED; } else { - acked |= FLAG_SYN_ACKED; + flag |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; } /* MTU probing checks */ - if (fully_acked icsk-icsk_mtup.probe_size) { - if (!after(tp-mtu_probe.probe_seq_end, TCP_SKB_CB(skb)-end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked icsk-icsk_mtup.probe_size + !after(tp-mtu_probe.probe_seq_end, scb-end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked TCPCB_RETRANS) { if (sacked TCPCB_SACKED_RETRANS) tp-retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt 0) { seq_rtt = now - scb-when; if (fully_acked) last_ackt = skb-tstamp
[PATCH 8/9] [TCP]: Enable SACK enhanced FRTO (RFC4138) by default
Most of the description that follows comes from my mail to netdev (some editing done): Main obstacle to FRTO use is its deployment as it has to be on the sender side where as wireless link is often the receiver's access link. Take initiative on behalf of unlucky receivers and enable it by default in future Linux TCP senders. Also IETF seems to be interested in advancing FRTO from experimental [1]. How does FRTO help? === FRTO detects spurious RTOs and avoids a number of unnecessary retransmissions and a couple of other problems that can arise due to incorrect guess made at RTO (i.e., that segments were lost when they actually got delayed which is likely to occur e.g. in wireless environments with link-layer retransmission). Though FRTO cannot prevent the first (potentially unnecessary) retransmission at RTO, I suspect that it won't cost that much even if you have to pay for each bit (won't be that high percentage out of all packets after all :-)). However, usually when you have a spurious RTO, not only the first segment unnecessarily retransmitted but the *whole window*. It goes like this: all cumulative ACKs got delayed due to in-order delivery, then TCP will actually send 1.5*original cwnd worth of data in the RTO's slow-start when the delayed ACKs arrive (basically the original cwnd worth of it unnecessarily). In case one is interested in minimizing unnecessary retransmissions e.g. due to cost, those rexmissions must never see daylight. Besides, in the worst case the generated burst overloads the bottleneck buffers which is likely to significantly delay the further progress of the flow. In case of ll rexmissions, ACK compression often occurs at the same time making the burst very sharp edged (in that case TCP often loses most of the segments above high_seq = very bad performance too). 
When FRTO is enabled, those unnecessary retransmissions are fully avoided except for the first segment and the cwnd behavior after detected spurious RTO is determined by the response (one can tune that by sysctl). Basic version (non-SACK enhanced one), FRTO can fail to detect spurious RTO as spurious and falls back to conservative behavior. ACK lossage is much less significant than reordering, usually the FRTO can detect spurious RTO if at least 2 cumulative ACKs from original window are preserved (excluding the ACK that advances to high_seq). With SACK-enhanced version, the detection is quite robust. FRTO should remove the need to set a high lower bound for the RTO estimator due to delay spikes that occur relatively common in some environments (esp. in wireless/cellular ones). [1] http://www1.ietf.org/mail-archive/web/tcpm/current/msg02862.html Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 948e79a..02b549b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] [TCP]: Avoid clearing sacktag hint in trivial situations
There's no reason to clear the sacktag skb hint when small part of the rexmit queue changes. Account changes (if any) instead when fragmenting/collapsing. RTO/FRTO do not touch SACKED_ACKED bits so no need to discard SACK tag hint at all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |6 +- net/ipv4/tcp_input.c | 14 -- net/ipv4/tcp_output.c | 12 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 16dfe3c..07b1faa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1067,11 +1067,15 @@ static inline void tcp_mib_init(void) } /* from STCP */ -static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { +static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; tp-forward_skb_hint = NULL; +} + +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { + tcp_clear_retrans_hints_partial(tp); tp-fastpath_skb_hint = NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 02b549b..1092b5a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1674,7 +1674,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-high_seq = tp-frto_highmark; TCP_ECN_queue_cwr(tp); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1714,10 +1714,14 @@ void tcp_enter_loss(struct sock *sk, int how) tp-bytes_acked = 0; tcp_clear_retrans(tp); - /* Push undo marker, if it was plain RTO and nothing -* was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing +* was retransmitted. 
*/ tp-undo_marker = tp-snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tcp_clear_all_retrans_hints(tp); + } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1744,8 +1748,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); /* Abort FRTO algorithm if one is in progress */ tp-frto_counter = 0; - - tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f46d24b..cbb83ac 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len skb-len); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize 0) nsize = 0; @@ -1718,9 +1718,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* changing transmit queue under us so clear hints */ - tcp_clear_all_retrans_hints(tp); - /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1759,6 +1756,13 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_adjust_fackets_out(tp, skb, tcp_skb_pcount(next_skb)); tp-packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + /* manually tune sacktag skb hint */ + if (tp-fastpath_skb_hint == next_skb) + tp-fastpath_skb_hint = skb; + sk_stream_free_skb(sk, next_skb); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH - net-2.6.24 0/2] Introduce and use print_ip and print_ipv6
On Thu, 20 Sep 2007, Joe Perches wrote: On Thu, 2007-09-20 at 07:55 -0700, Randy Dunlap wrote: How large are the patches if you posted them for review instead of just referencing gits for them? (which cuts down on review possibilities) The v4 is ~130kb, the v6 ~35kb. There is a gitweb available at: print_ip: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=shortlog;h=print_ipv4 commit diff: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=commitdiff;h=1e3a30d5d8b49b3accca07cc84ecf6d977cacdd5 print_ipv6: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=shortlog;h=print_ipv6 commit diff: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=commitdiff;h=e96b794a57a164db84379e2baf5fe2622a5ae3bf ...Alternatively you could split it up a bit and send those smaller chunks for reviewing purposes only (even though it would be combined to a single big patch in the end). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [TCP]: Avoid clearing sacktag hint in trivial situations
On Thu, 20 Sep 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 20 Sep 2007 15:17:52 +0300 There's no reason to clear the sacktag skb hint when small part of the rexmit queue changes. Account changes (if any) instead when fragmenting/collapsing. RTO/FRTO do not touch SACKED_ACKED bits so no need to discard SACK tag hint at all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Applied, and I followed it up with this coding style fixlet. Yeah, thanks for fixing it... ...Just didn't notice it was left wrong while doing things that required more thinking to get them right... -- i.
[PATCH netdev-2.6/ALL] DIV_ROUND_UP/roundup cleanup (under drivers/net/)
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- drivers/net/arcnet/rfc1201.c|3 ++- drivers/net/atl1/atl1_main.c| 17 +++-- drivers/net/gianfar_ethtool.c |2 +- drivers/net/ibm_emac/ibm_emac_mal.h |3 ++- drivers/net/mlx4/alloc.c|3 ++- drivers/net/mlx4/icm.c |3 ++- drivers/net/s2io.c | 11 --- drivers/net/sb1250-mac.c|2 +- drivers/net/starfire.c |6 +++--- drivers/net/wan/sbni.c |2 +- drivers/net/xen-netfront.c |2 +- 11 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/net/arcnet/rfc1201.c b/drivers/net/arcnet/rfc1201.c index 460a095..126ee5b 100644 --- a/drivers/net/arcnet/rfc1201.c +++ b/drivers/net/arcnet/rfc1201.c @@ -23,6 +23,7 @@ * * ** */ +#include linux/kernel.h #include linux/module.h #include linux/init.h #include linux/if_arp.h @@ -493,7 +494,7 @@ static int prepare_tx(struct net_device *dev, struct archdr *pkt, int length, out-length = length - RFC1201_HDR_SIZE; out-dataleft = lp-outgoing.length; - out-numsegs = (out-dataleft + maxsegsize - 1) / maxsegsize; + out-numsegs = DIV_ROUND_UP(out-dataleft, maxsegsize); out-segnum = 0; BUGMSG(D_DURING, rfc1201 prep_tx: ready for %d-segment split diff --git a/drivers/net/atl1/atl1_main.c b/drivers/net/atl1/atl1_main.c index c8deefa..b198f70 100644 --- a/drivers/net/atl1/atl1_main.c +++ b/drivers/net/atl1/atl1_main.c @@ -51,6 +51,7 @@ */ #include linux/types.h +#include linux/kernel.h #include linux/netdevice.h #include linux/pci.h #include linux/spinlock.h @@ -1132,8 +1133,7 @@ static void atl1_update_rfd_index(struct atl1_adapter *adapter, { u16 num_buf; - num_buf = (rrd-xsz.xsum_sz.pkt_size + adapter-rx_buffer_len - 1) / - adapter-rx_buffer_len; + num_buf = DIV_ROUND_UP(rrd-xsz.xsum_sz.pkt_size, adapter-rx_buffer_len); if (rrd-num_buf == num_buf) /* clean alloc flag for bad rrd */ atl1_clean_alloc_flag(adapter, rrd, num_buf); @@ -1537,8 +1537,7 @@ static void atl1_tx_map(struct atl1_adapter *adapter, struct sk_buff *skb, if (first_buf_len proto_hdr_len) { len12 = first_buf_len - proto_hdr_len; 
- m = (len12 + ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + m = DIV_ROUND_UP(len12, ATL1_MAX_TX_BUF_LEN); for (i = 0; i m; i++) { buffer_info = tpd_ring-buffer_info[tpd_next_to_use]; @@ -1578,7 +1577,7 @@ static void atl1_tx_map(struct atl1_adapter *adapter, struct sk_buff *skb, frag = skb_shinfo(skb)-frags[f]; lenf = frag-size; - m = (lenf + ATL1_MAX_TX_BUF_LEN - 1) / ATL1_MAX_TX_BUF_LEN; + m = DIV_ROUND_UP(lenf, ATL1_MAX_TX_BUF_LEN); for (i = 0; i m; i++) { buffer_info = tpd_ring-buffer_info[tpd_next_to_use]; if (unlikely(buffer_info-skb)) @@ -1681,8 +1680,7 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) for (f = 0; f nr_frags; f++) { frag_size = skb_shinfo(skb)-frags[f].size; if (frag_size) - count += (frag_size + ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + count += DIV_ROUND_UP(frag_size, ATL1_MAX_TX_BUF_LEN); } /* mss will be nonzero if we're doing segment offload (TSO/GSO) */ @@ -1697,9 +1695,8 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } /* need additional TPD ? */ if (proto_hdr_len != len) - count += (len - proto_hdr_len + - ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + count += DIV_ROUND_UP(len - proto_hdr_len, + ATL1_MAX_TX_BUF_LEN); } } diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c index 2470903..8352754 100644 --- a/drivers/net/gianfar_ethtool.c +++ b/drivers/net/gianfar_ethtool.c @@ -245,7 +245,7 @@ static unsigned int gfar_usecs2ticks(struct gfar_private *priv, unsigned int use /* Make sure we return a number greater than 0 * if usecs 0 */ - return ((usecs * 1000 + count - 1) / count); + return DIV_ROUND_UP(usecs * 1000, count); } /* Convert ethernet clock ticks to microseconds */ diff --git a/drivers/net/ibm_emac/ibm_emac_mal.h b/drivers/net
Re: [PATCH] make _minimum_ TCP retransmission timeout configurable
On Wed, 29 Aug 2007, David Miller wrote: From: Rick Jones [EMAIL PROTECTED] Date: Wed, 29 Aug 2007 16:06:27 -0700 I believe the biggest component comes from link-layer retransmissions. There can also be some short outages thanks to signal blocking, tunnels, people with big hats and whatnot that the link-layer retransmissions are trying to address. The three seconds seems to be a value that gives the certainty that 99 times out of 100 the segment was indeed lost. The trace I've been sent shows clean RTTs ranging from ~200 milliseconds to ~7000 milliseconds. Thanks for the info. It's pretty easy to generate examples where we might have some sockets talking over interfaces on such a network and others which are not. Therefore, if we do this, a per-route metric is probably the best bet. Ilpo, I'm also very interested to see what you think of all of this :-) ...Haven't been too actively reading mails for a while until now, so I'm a bit late in response... I'll try to quickly summarize FRTO here. It's true that FRTO cannot prevent the first retransmission, yet I suspect that it won't cost that much even if you have to pay for each bit, won't be that high percentage out of all packets after all :-). However, usually when you have a spurious RTO, not only is the first segment unnecessarily retransmitted but the *whole window*. It goes like this: all cumulative ACKs got delayed due to in-order delivery, then TCP will actually send 1.5*original cwnd worth of data in the RTO's slow-start when the delayed ACKs arrive (basically the original cwnd worth of it unnecessarily). In case one is interested in minimizing unnecessary retransmissions e.g. due to cost, those rexmissions must never see daylight. Besides, in the worst case the generated burst overloads the bottleneck buffers which is likely to significantly delay the further progress of the flow. 
In case of ll rexmissions, ACK compression often occurs at the same time making the burst very sharp edged (in that case TCP often loses most of the segments above high_seq = very bad performance too). When FRTO is enabled, those unnecessary retransmissions are fully avoided except for the first segment and the cwnd behavior after detected spurious RTO is determined by the response (one can tune that by sysctl). Basic version (non-SACK enhanced one), FRTO can fail to detect spurious RTO as spurious and falls back to conservative behavior. ACK lossage is much less significant than reordering, usually the FRTO can detect spurious RTO if at least 2 cumulative ACKs from original window are preserved (excluding the ACK that advances to high_seq). With SACK-enhanced version, the detection is quite robust. Of course one could jump to min_rto bandwagon instead, but it often ends up being more or less black magic and can still produce unwanted behavior unless one goes to ridiculously high minimum RTOs. Main obstacle to FRTO use is its deployment as it has to be on the sender side whereas wireless link is often the receiver's access link but if one can tune tcp_min_rto (or equal) on the sender side, one could enable FRTO at will as well. Anyway, anything older than 2.6.22 is not going to give very good results with FRTO. FRTO code's maturity point of view, IMHO currently just unconditional clearing of undo_marker (in tcp_enter_frto_loss) is on the way of enabling FRTO in future kernels by default as it basically disables DSACK undoing, I'll try to solve that soon, has been on my todo list for too long already (don't currently have much time to devote to that though so 2.6.24-rc1 might come too early for me :-(). After that, it might be a good move to enable it in mainline by default if you agree... ...Uninteresting enough, even IETF seems to be interested in advancing FRTO from experimental [1]. 
Another important thing to consider in cellular besides ll rexmissions is bandwidth allocation delay... We actually a week ago ran some measurements in a real umts network to determine buffer, one-way delay, etc. behavior (though YMMV depending on operator's configuration etc.). Basically we saw 1 s delay spike when allocation delay occurs (it's very hard to predict when that happens due to other network users' role). One-way propagation delay was around 50 ms, so 1500 bytes takes about 80 ms+ to transmit, so it's an order of magnitude larger than RTT but queue delay is probably large enough to prevent spurious RTOs due to allocation delay. Besides that, we saw some long latencies, up to 8-12 s, they could be due to ll retransmissions but their source is not yet verified to be the WWAN link as we had the phone connected through bluetooth (could interfere). A funny sidenote about the experiment, we found out what Linux cannot do (from userspace only): it seems to be unable to receive the same packet it has sent out to itself as we forced the packet out from eth0 by binding sending dev to eth0 and
[PATCH net-2.6.24] [NET]: DIV_ROUND_UP cleanup (part two)
Hopefully captured all single statement cases under net/. I'm not too sure if there is some policy about #includes that are guaranteed (ie., in the current tree) to be available through some other #included header, so I just added linux/kernel.h to each changed file that didn't #include it previously. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/sock.h |3 ++- net/bridge/br_stp_bpdu.c |2 +- net/dccp/ackvec.c|5 ++--- net/ieee80211/ieee80211_crypt_ccmp.c |5 +++-- net/ipv4/inet_diag.c |5 +++-- net/ipv4/inet_timewait_sock.c|4 ++-- net/ipv4/tcp.c |3 ++- net/mac80211/aes_ccm.c |5 +++-- net/mac80211/tx.c|2 +- net/sunrpc/svcsock.c |3 ++- 10 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index dfeb8b1..802c670 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -40,6 +40,7 @@ #ifndef _SOCK_H #define _SOCK_H +#include linux/kernel.h #include linux/list.h #include linux/timer.h #include linux/cache.h @@ -702,7 +703,7 @@ extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind); static inline int sk_stream_pages(int amt) { - return (amt + SK_STREAM_MEM_QUANTUM - 1) / SK_STREAM_MEM_QUANTUM; + return DIV_ROUND_UP(amt, SK_STREAM_MEM_QUANTUM); } static inline void sk_stream_mem_reclaim(struct sock *sk) diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 60112bc..14f0c88 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -64,7 +64,7 @@ static inline int br_get_ticks(const unsigned char *src) { unsigned long ticks = ntohs(get_unaligned((__be16 *)src)); - return (ticks * HZ + STP_HZ - 1) / STP_HZ; + return DIV_ROUND_UP(ticks * HZ, STP_HZ); } /* called under bridge lock */ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 3f8984b..83378f3 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -69,9 +69,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = 
dp-dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = (av-dccpav_vec_len + -DCCP_MAX_ACKVEC_OPT_LEN - 1) / - DCCP_MAX_ACKVEC_OPT_LEN; + const u16 nr_opts = DIV_ROUND_UP(av-dccpav_vec_len, +DCCP_MAX_ACKVEC_OPT_LEN); u16 len = av-dccpav_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c index b016b41..2e6b099 100644 --- a/net/ieee80211/ieee80211_crypt_ccmp.c +++ b/net/ieee80211/ieee80211_crypt_ccmp.c @@ -9,6 +9,7 @@ * more details. */ +#include linux/kernel.h #include linux/err.h #include linux/module.h #include linux/init.h @@ -241,7 +242,7 @@ static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) hdr = (struct ieee80211_hdr_4addr *)skb-data; ccmp_init_blocks(key-tfm, hdr, key-tx_pn, data_len, b0, b, s0); - blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN; + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); last = data_len % AES_BLOCK_LEN; for (i = 1; i = blocks; i++) { @@ -351,7 +352,7 @@ static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) ccmp_init_blocks(key-tfm, hdr, pn, data_len, b0, a, b); xor_block(mic, b, CCMP_MIC_LEN); - blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN; + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); last = data_len % AES_BLOCK_LEN; for (i = 1; i = blocks; i++) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dbeacd8..06922da 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include linux/kernel.h #include linux/module.h #include linux/types.h #include linux/fcntl.h @@ -112,7 +113,7 @@ static int inet_csk_diag_fill(struct sock *sk, } #endif -#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ +#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) if (icsk-icsk_pending == ICSK_TIME_RETRANS) { r-idiag_timer = 1; @@ -190,7 +191,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r-id.idiag_dst[0]= tw-tw_daddr; r-idiag_state= tw-tw_substate; r-idiag_timer= 3; - r-idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r-idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); r-idiag_rqueue = 0; r-idiag_wqueue = 0
net-2.6.24 build broken (allyesconfig)
Hi, $ git-reset --hard net-2.6.24-origin HEAD is now at e2eb35e... [ATM]: Fix build errors after conversion to pr_debug() $ make allyesconfig ... $ make bzImage ... CC drivers/net/82596.o drivers/net/82596.c:1618:1: error: unterminated argument list invoking macro DEB drivers/net/82596.c: In function 'set_multicast_list': drivers/net/82596.c:1564: error: 'DEB' undeclared (first use in this function) drivers/net/82596.c:1564: error: (Each undeclared identifier is reported only once drivers/net/82596.c:1564: error: for each function it appears in.) drivers/net/82596.c:1564: error: expected ';' at end of input make[1]: *** [drivers/net/82596.o] Error 1 make: *** [drivers/net/82596.o] Error 2 Hmm, I would guess that [NET]: Introduce MAC_FMT/MAC_ARG broke it, though didn't verify it. The fix is left as an exercise for the reader (i.e., the solution wasn't too obvious for me :-) )... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/5] [TCP]: Discard fuzzy SACK blocks
On Fri, 24 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 20 Aug 2007 16:16:32 +0300 SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Thanks a lot for coding this up, I like it a lot, applied. I have some minor worries about the D-SACK lower bound, but it's probably OK and I'm just being paranoid :-) ...Please tell what is your concern rather than hinting :-), or is it just a hunch?... Elsewhere we do a similar checking anyway (I might eventually end up dropping this check in dsack as duplicate due to other planned changes but it's necessary still as the validation is being done in the mainloop after check_dsack call): /* D-SACK for already forgotten data... Do dumb counting. */ if (dup_sack !after(end_seq_0, prior_snd_una) after(end_seq_0, tp-undo_marker)) tp-undo_retrans--; ...It's natural that due to HW duplication that we get DSACK below undo_marker when state = CA_CWR. In general, they almost always mean exactly that, a HW duplication, so instead IMHO we should account them as DSACK lying bit in sack_ok and disable DSACK undos for that flow (similar case is !EVER_RETRANS skb gets DSACKed). In theory, it could be delayed rexmission or something but we've already lost our state already and cannot verify that, so choosing the conservative dsack-lying bit there seems fine and should even be right thing for majority of the cases anyway. I was planning to do something along those lines later... 
The key problem here was (in the previous version that was in tcp-2.6), that the whole validation business becomes rather useless, if any invalid SACK blocks (those that really aren't DSACK but due to bug, malicious or whatever reason) below snd_una get accepted as DSACK, that's basically the thing I wanted to avoid as it will be significant for the half of the seqno space... Now there's hopefully a bit smaller window for such garbage... :-) Key exits from tcp_is_sackblock_valid reside before those checks anyway, so they shouldn't be that problematic performance-wise either. ...I was thinking of adding unlikely to the latter checks but wasn't too sure if that's a wise thing to do as malicious entity could push TCP to do them (basically at will), Comments? One additional note: the mainloop operates anyway only in the seqno range above prior_snd_una (earlier skbs already being dropped), that can't ever be undo_marker (so some of the DSACK checks are not yet strictly necessary but I wanted to do things right from the beginning as I might end up re-placing validation soon ;-)). ...So our discussion currently probably covers seqno range that is not going to have any significance at all... 
:-) Maybe my Too old comment deserves some additional explanation: [PATCH] [TCP]: More verbose comment to DSACK validation Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8692d0b..cd187c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1070,7 +1070,10 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, if (!before(start_seq, tp-undo_marker)) return 1; - /* Too old */ + /* Too old (it no longer has any significance to TCP state though +* it can be valid; for more complete explanation see comment above +* and similar validation done in tcp_check_dsack()) +*/ if (!after(end_seq, tp-undo_marker)) return 0; -- 1.5.0.6
[PATCH net-2.6.24] [NET] Cleanup: DIV_ROUND_UP
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |6 +- net/key/af_key.c | 17 + 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 10b2e39..bca4ee2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -646,11 +646,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned skb_shinfo(skb)-gso_size = 0; skb_shinfo(skb)-gso_type = 0; } else { - unsigned int factor; - - factor = skb-len + (mss_now - 1); - factor /= mss_now; - skb_shinfo(skb)-gso_segs = factor; + skb_shinfo(skb)-gso_segs = DIV_ROUND_UP(skb-len, mss_now); skb_shinfo(skb)-gso_size = mss_now; skb_shinfo(skb)-gso_type = sk-sk_gso_type; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 5502df1..17b2a69 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -352,16 +352,14 @@ static int verify_address_len(void *p) switch (addr-sa_family) { case AF_INET: - len = sizeof(*sp) + sizeof(*sin) + (sizeof(uint64_t) - 1); - len /= sizeof(uint64_t); + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp-sadb_address_len != len || sp-sadb_address_prefixlen 32) return -EINVAL; break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: - len = sizeof(*sp) + sizeof(*sin6) + (sizeof(uint64_t) - 1); - len /= sizeof(uint64_t); + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp-sadb_address_len != len || sp-sadb_address_prefixlen 128) return -EINVAL; @@ -386,14 +384,9 @@ static int verify_address_len(void *p) static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) { - int len = 0; - - len += sizeof(struct sadb_x_sec_ctx); - len += sec_ctx-sadb_x_ctx_len; - len += sizeof(uint64_t) - 1; - len /= sizeof(uint64_t); - - return len; + return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + + sec_ctx-sadb_x_ctx_len, + sizeof(uint64_t)); } static inline int verify_sec_ctx_len(void *p) -- 1.5.0.6
[PATCH net-2.6.24 0/5] TCP: Cleanups SACK block validation
Hi Dave, Here are a couple of patches to net-2.6.24. The first three are trivial cleanups. The idea for the last two comes from tcp-2.6 but the validator has been heavily modified (and hopefully improved in the process :-)). I'm not sure though if checking for the undo_marker boundary crossing case is a bit over-engineering (inherited from the original version which already checked for that case). In addition, better names could be invented for MIBs, suggestions? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] [TCP]: Discard fuzzy SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 82 ++ 1 files changed, 82 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bf3d57..102aefa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1001,7 +1001,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric, *for retransmitted and already SACKed segment - reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * -- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * - outs wnd - - wrapzone - + * u e n u_w e_w s n_w + * | | | | | | | + * |+--+- TCP seqno space --+--| + * ...-- 2^31 -| |... + * ... 2^31 --||... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. 
Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap - snd_una region will then become well defined, i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp-max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp-snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp-snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. 
+* start_seq == snd_una is non-sensical (see comments above) +*/ + if (after(start_seq, tp-snd_una)) + return 1; + + if (!is_dsack || !tp-undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp-snd_una)) + return 0; + + if (!before(start_seq, tp-undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp-undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: +* start_seq undo_marker and end_seq = undo_marker. +*/ + return !before(start_seq, end_seq - tp-max_window); +} + + static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1143,6 +1222,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count
[PATCH 2/5] [TCP]: tcp_packets_out_inc to tcp_output.c (no callers elsewhere)
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h | 12 net/ipv4/tcp_output.c | 12 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d586ca..f28f382 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -614,18 +614,6 @@ static inline void tcp_dec_pcount_approx(__u32 *count, tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); } -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - int orig = tp-packets_out; - - tp-packets_out += tcp_skb_pcount(skb); - if (!orig) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)-icsk_rto, TCP_RTO_MAX); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d65ce1..a61a3e3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +static inline void tcp_packets_out_inc(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orig = tp-packets_out; + + tp-packets_out += tcp_skb_pcount(skb); + if (!orig) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)-icsk_rto, TCP_RTO_MAX); +} + static void update_send_head(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TCP]: Remove unnecessary wrapper tcp_packets_out_dec
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Makes caller side more obvious, there's no need to have a wrapper for this oneliner! Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |6 -- net/ipv4/tcp_input.c |2 +- net/ipv4/tcp_output.c |2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c65989..6d586ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -626,12 +626,6 @@ static inline void tcp_packets_out_inc(struct sock *sk, inet_csk(sk)-icsk_rto, TCP_RTO_MAX); } -static inline void tcp_packets_out_dec(struct tcp_sock *tp, - const struct sk_buff *skb) -{ - tp-packets_out -= tcp_skb_pcount(skb); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 96ced89..45ad32c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2548,7 +2548,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) last_ackt = skb-tstamp; } tcp_dec_pcount_approx(tp-fackets_out, skb); - tcp_packets_out_dec(tp, skb); + tp-packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a367917..1d65ce1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1735,7 +1735,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m * it is better to underestimate fackets. */ tcp_dec_pcount_approx(tp-fackets_out, next_skb); - tcp_packets_out_dec(tp, next_skb); + tp-packets_out -= tcp_skb_pcount(next_skb); sk_stream_free_skb(sk, next_skb); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] [TCP] MIB: Add counters for discarded SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] In DSACK case, some events are not extraordinary, such as packet duplication generated DSACK. They can arrive easily below snd_una when undo_marker is not set (TCP being in CA_Open), counting such DSACKs amoung SACK discards will likely just mislead if they occur in some scenario when there are other problems as well. Similarly, excessively delayed packets could cause normal DSACKs. Therefore, separate counters are allocated for DSACK events. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h |3 +++ net/ipv4/proc.c |3 +++ net/ipv4/tcp_input.c | 10 +- 3 files changed, 15 insertions(+), 1 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 802b3a3..d24c554 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,9 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ + LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ + LINUX_MIB_TCPDSACKIGNOREDNOUNDO,/* TCPSACKIgnoredNoUndo */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf..986d1c8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPAbortOnLinger, LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM(TCPAbortFailed, LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM(TCPMemoryPressures, LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM(TCPSACKDiscard, LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM(TCPDSACKIgnoredOld, LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM(TCPDSACKIgnoredNoUndo, LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 102aefa..8692d0b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1222,8 +1222,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct 
sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack (i == first_sack_index)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp-undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] [TCP]: Rename tcp_ack_packets_out - tcp_rearm_rto
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Only thing that tiny function does is rearming the RTO (if necessary), name it accordingly. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45ad32c..2bf3d57 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2406,8 +2406,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2560,7 +2559,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = inet_csk(sk)-icsk_ca_ops; tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); + tcp_rearm_rto(sk); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [IPv6]: Invalid semicolon after if statement
On Wed, 15 Aug 2007, David Miller wrote: From: Dave Jones [EMAIL PROTECTED] Date: Wed, 15 Aug 2007 19:52:20 -0400 On Wed, Aug 15, 2007 at 03:08:14PM -0700, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 16 Aug 2007 00:57:00 +0300 (EEST) A similar fix to netfilter from Eric Dumazet inspired me to look around a bit by using some grep/sed stuff as looking for this kind of bug seemed easy to automate. This is one of them I found where it looks like this semicolon is not valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Yikes! Makes you want to audit the entire tree for these things :-))) Indeed. Here's another one. Signed-off-by: Dave Jones [EMAIL PROTECTED] That got fixed the other day and is the A similar fix to netfilter from Eric Dumazet Ilpo is referring to above :) ...heh, when I said a bit, I meant that it took a very little effort to check over the whole tree... :-) ...I've already reported all four things one could find from whole tree with this cmd (to the relevant parties), so no need for you to redo the effort :-) : $ for i in `find . -name '*.[ch]'`; do echo $i; sed -n -e '/^\t*if *[(].*[)] *; *$/ N' -e '/^\(\t*\)if *[(].*[)] *; *\n\1\t/ p' $i; done | tee result ...Basically it checks if the next line has more indentation than the if line. ...Obviously it will work only if the code follows current CodingStyle in indentation. I'm currently experimenting with indent preprocessing step... ...but I'm not yet sure if I can pull something useful out from that too :-) ...Better cmdlines could be invented, e.g. by manually checking every if (); lines once one has first automated separation of them from if () do_smthg(); lines (I haven't learned myself how to easily count/work with parenthesis pairs in a cmdline, which seems necessary here)... -- i.
Re: drivers/infiniband/mlx/mad.c misplaced ;
...I guess those guys hunting for broken busyloops in the other thread could also benefit from similar searching commands introduced in this thread... ...Ccing Satyam to caught their attention too. On Wed, Aug 15, 2007 at 05:40:11PM -0700, Joe Perches wrote: There's more than a few of these (not inspected). $ egrep -r --include=*.c \bif[[:space:]]*\([^\)]*\)[[:space:]]*\; * ...Hmm, I plugged in a preprocessor too to manage with non compliant coding styles :-). Please understand that the line numbers are not an exact match due to preprocessor changes: $ for i in `find . -name '*.[ch]'`; do echo $i; indent -npro -kr -i8 -ts8 -sob -l8000 -ss -ncs -cp1 -nhnl -st $i | egrep -n [[:space:]]if [(].*[)] ;$; done | grep -B1 ^[^.] ./arch/arm/mach-omap1/leds-innovator.c 97: if (led_state LED_STATE_ENABLED) ; -- ./arch/mips/sibyte/cfe/console.c 23: if (written 0) ; 32: if (written 0) ; -- ./arch/powerpc/kernel/legacy_serial.c 524:if (0) ; -- ./arch/powerpc/xmon/ppc-opc.c 938:else if (value == 0) ; -- ./arch/sh/boards/se/7343/io.c 137:if (0) ; -- ./arch/um/kernel/tt/tracer.c 254:if (WIFEXITED(status)) ; -- ./arch/x86_64/ia32/ptrace32.c 363:if (__copy_from_user(child-thread.i387.fxsave, u, sizeof(*u))) ; -- ./arch/x86_64/kernel/traps.c 801:if (eregs == (struct pt_regs *)eregs-rsp) ; -- ./drivers/atm/iphase.c 159:if (!desc1) ; -- ./drivers/isdn/capi/capiutil.c 456:else if (c = 0x0f) ; -- ./drivers/isdn/hisax/hfc_pci.c 125:if (Read_hfc(cs, HFCPCI_INT_S1)) ; 155:if (Read_hfc(cs, HFCPCI_INT_S1)) ; 1483: if (Read_hfc(cs, HFCPCI_INT_S1)) ; -- ./drivers/isdn/hisax/hfc_sx.c 377:if (Read_hfc(cs, HFCSX_INT_S1)) ; 407:if (Read_hfc(cs, HFCSX_INT_S2)) ; 1246: if (Read_hfc(cs, HFCSX_INT_S1)) ; -- ./drivers/media/video/video-buf.c 1141: if (q-bufs[i]) ; -- ./drivers/net/lp486e.c 777:if (lp-scb.command i596_timeout(dev, i596_cleanup_cmd, 100)) ; 785:if (lp-scb.command i596_timeout(dev, i596_reset, 100)) ; 794:if (lp-scb.command i596_timeout(dev, i596_reset(2), 400)) ; 820:if (lp-scb.command 
i596_timeout(dev, i596_add_cmd, 100)) ; 1146: if (lp-scb.command i596_timeout(dev, interrupt, 40)) ; 1192: if (lp-scb.command i596_timeout(dev, i596 interrupt, 100)) ; 1217: if (lp-scb.command i596_timeout(dev, i596_close, 200)) ; -- ./drivers/net/ni5010.c 273:if (dev-irq == 0xff) ; -- ./drivers/net/ni52.c 648:if (result TDR_LNK_OK) ; -- ./drivers/net/sun3_82586.c 498:if (result TDR_LNK_OK) ; -- ./drivers/pci/hotplug/ibmphp_core.c 418:else if (mode == BUS_MODE_PCI) ; 636:else if (mode == BUS_MODE_PCI) ; -- ./drivers/usb/gadget/file_storage.c 2480: if (protocol_is_scsi()) ; -- ./drivers/usb/host/uhci-debug.c 416:if (i = SKEL_ISO) ; 419:else if (!uhci-fsbr_is_on) ; -- ./drivers/usb/host/uhci-q.c 541:if (qh-skel == SKEL_ISO) ; -- ./drivers/usb/misc/usbtest.c 1401: if (status != 0) ; -- ./drivers/video/intelfb/intelfbdrv.c 337:if (get_opt_bool(this_opt, accel, accel)) ; 338:else if (get_opt_int(this_opt, vram, vram)) ; 339:else if (get_opt_bool(this_opt, hwcursor, hwcursor)) ; 340:else if (get_opt_bool(this_opt, mtrr, mtrr)) ; 341:else if (get_opt_bool(this_opt, fixed, fixed)) ; -- ./drivers/video/matrox/matroxfb_DAC1064.c 46: if (fvco = 10) ; -- ./drivers/video/matrox/matroxfb_maven.c 298:if (fvco = 1) ; 316:if (fvco = 10) ; -- ./fs/hfs/inode.c 72: if (!node) ; -- ./fs/hfsplus/inode.c 67: if (!node) ; -- ./fs/hostfs/hostfs_user.c 300:if (attrs-ia_valid HOSTFS_ATTR_CTIME) ; -- ./fs/xfs/xfs_bmap.c 2287: if (nullfb || XFS_FSB_TO_AGNO(mp, ap-rval) == fb_agno) ; -- ./fs/xfs/xfs_dir2.c 281:else if ((rval = xfs_dir2_isblock(tp, dp, v))) ; -- ./fs/xfs/xfs_iomap.c 248:if (io-io_flags XFS_IOCORE_RT) ; -- ./include/asm-cris/uaccess.h 255:if (n == 0) ; 303:if (n == 0) ; 351:if (n == 0) ; -- ./mm/swapfile.c 791:if (swcount = 1) ; -- ./net/core/pktgen.c 2256: if (pkt_dev-min_in6_daddr.s6_addr32[0] == 0 pkt_dev-min_in6_daddr.s6_addr32[1] == 0 pkt_dev-min_in6_daddr.s6_addr32[2] == 0 pkt_dev-min_in6_daddr.s6_addr32[3] == 0) ; -- ./net/irda/af_irda.c 1357: if (ret) ; 1358: else if 
(sk-sk_shutdown RCV_SHUTDOWN) ; -- ./net/irda/irnetlink.c 105:if (nla_put_string(msg, IRDA_NL_ATTR_IFNAME, dev-name)) ; -- ./net/netfilter/xt_u32.c 38: if (skb-len 4 || pos skb-len - 4) ; -- ./sound/pci/au88x0/au88x0_core.c 2076: if (vortex_adbdma_bufshift(vortex, i)) ; 2085: if (vortex_wtdma_bufshift(vortex, i)) ; --
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Herbert Xu wrote: We've been through that already. If it's a busy-wait it should use cpu_relax. I looked around a bit by using some command lines and ended up wondering if these are equal to busy-wait case (and should be fixed) or not: ./drivers/telephony/ixj.c 6674: while (atomic_read(j-DSPWrite) 0) 6675- atomic_dec(j-DSPWrite); ...besides that, there are a couple more similar cases in the same file (with braces)... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [IPv6]: Invalid semicolon after if statement
A similar fix to netfilter from Eric Dumazet inspired me to look around a bit by using some grep/sed stuff as looking for this kind of bugs seemed easy to automate. This is one of them I found where it looks like this semicolon is not valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv6/ipv6_sockglue.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d684639..761a910 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -820,7 +820,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); - if (copy_to_user(optval, hdr, len)); + if (copy_to_user(optval, hdr, len)) return -EFAULT; return ipv6_optlen(hdr); } -- 1.5.0.6
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: I applied everything up until the SACK validator to net-2.6.24 Ok, thanks. Everything I hit today which had not been posted before was trivial fix or a reasonable small cleanup. ...Yeah, I know that but don't want to give impression that something gets sneaked in... Why don't you just post the patches to the list one-by-one from now on so we can review them in parallel with merging into net-2.6.24? Ok, will do. ...I thought that at some point that would be wise way forward... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6.24] [TCP]: Update comment about highest_sack validity
This stale info came from the original idea, which proved to be unnecessarily complex, sacked_out 0 is easy to do and that when it's going to be needed anyway (it _can_ be valid also when sacked_out == 0 but there's not going to be a guarantee about it for now). Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f12fa0..f8cf090 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -332,7 +332,8 @@ struct tcp_sock { struct tcp_sack_block_wire recv_sack_cache[4]; - u32 highest_sack; /* Start seq of globally highest revd SACK (valid only in slowpath) */ + u32 highest_sack; /* Start seq of globally highest revd SACK +* (validity guaranteed only if sacked_out 0) */ /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; -- 1.5.0.6
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: Ok, as is clear for some the net-2.6.24 tree has been cut and is at: kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.24.git I'll be going over the TCP bits Ilpo and I have been discussing ...In case I didn't make it clear previously, you'll notice that there are some non-submitted changes rebased as well on the top of it. then expect me to disappear back to driver hacking :-) As is, there should be enough in there for people to play with, regression test, and improve. At least these are trivial to take (after rebase, some come nicely even from the before tree): 40564051bdb237231e625ba674fdedf6e8373027 [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked 4c035cd78a6b60710b0dda4f62339df070f761c8 [TCP]: Add tcp_dec_pcount_approx int variant ed2753b0b463df41c10121ce494d51428047fcbc [TCP]: Move code from tcp_ecn.h to tcp*.c and tcp.h remove it c215314c6ea0261865d3df09f375016f3dcadeba [TCP]: Access to highest_sack obsoletes forward_cnt_hint 9e5f432fb247af2f0062c3c57d7f45d511692f26 [TCP] FRTO: remove unnecessary fackets/sacked_out recounting 96001b306c60cb969d456ac70e376220db97e54a [TCP]: Move Reno SACKed_out counter functions earlier 318cf753170367504cfac07ac89e97befb1ca501 [TCP]: Extract DSACK detection code from tcp_sacktag_write_queue(). a2539974b098065ebe02ad11a6411df4f56ba0ed [TCP]: Rexmit hint must be cleared instead of setting it - Could be combined with the extrator as it's really a bug fix, once rxmit cnt hint gets dropped, we can add this back as optimization :-). 2fea67411f0c89642fa4abc0490b65c7852a1202 [TCP]: Extracted rexmit hint clearing from the LOST marking code 1fba6548b2ecc2d513981898247472d1183329c5 [TCP]: Add highest_sack seqno, points to globally highest SACK ...I was claiming incorrectly last time that highest_sack stuff needs SACK-block validation, it does not as it takes seq from skb-seq, not from the receiver like I thought :-). 
...Redoing the validator (the improved version) on top of net-2.6.24 is on my high priority list though, so maybe I'll have it already on today. Left_out removal should be safe too: after my analysis, it seems safe as most of the functions are if-modified-then-sync type, some sync always. Clean_rtx_queue leaves left_out to old state which is the hardest thing to track. However, TCP should always execute fastretrans_alert (that syncs near entry) if sacked_out+lost_out is (or was) non-zero, so it shouldn't leak stale state anywhere. Not going to fastretrans_alert would be IMHO a bug that should be fixed. I found one such case which is fixed in [TCP]: Keep state in Disorder also if only lost_out 0). Also FRTO that is called before fastretrans_alert sync at entry, so it shouldn't be a problem to it either. ...Taking IsReno/IsFack removal is probably going to be hardest one, as it will conflict a lot. ...Maybe some sed work in patches would reduce # of conflicts instead of trying cherry-pick+rebase. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 9 Aug 2007 12:50:51 +0300 (EEST) Ok, in order to get this started I merged the above patches into net-2.6.24 I'm exhausted and done for today. :-) :-) We can work on the other bits next. Next step: http://www.cs.helsinki.fi/u/ijjarvin/tcp-rebase/after-reorder/ ...I put some effort to reorganization and there's the result, please _don't_ take the new SACK block-validator yet as I haven't yet put it under any testing (it's the only difference this reordering of patches seems to have introduced :-)). The first five commits touch left_out (4 latest fix bug and flaws I made in the first commit), I could combine them into a single one if you want to. ...Notice that part of the next step includes work that hasn't yet been posted to netdev at all (was from my not yet submitted wip portion of it). Would you prefer me to post them using the usual procedure? ...Taking IsReno/IsFack removal is probably going to be hardest one, as it will conflict a lot. ...Maybe some sed work in patches would reduce # of conflicts instead of trying cherry-pick+rebase. Yes, such patch hand editing techniques can help in these cases. ...it helped a lot in the process :-). -- i.
Re: [RFC] stuff from tcp-2.6 partially merged to upcoming net-2.6.24?
On Wed, 8 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Tue, 7 Aug 2007 16:19:58 +0300 (EEST) Do you have any suggestion how I should proceed? Or do you perhaps object such partial merge completely? ...I could try to come up with a cleaned up patch series which has original and their bug fix parts combined to a single patch per change (would provide cleaner history and shouldn't be very hard to do either)... Thank you for doing the rebase. I agree with you that we should separate out as much of the non-rb-tree stuff as possible and put it into net-2.6.24 ...It would also help a lot in case we at some point of time in the future decide to merge rb-tree stuff but then have to back up to have it merged in a cleaner state which is easier to revert than with cleanups combined... Besides once I get some additional bits done (haven't yet dared to do L-bit cascade surgery which will fix timedout loop entry bugs and allow dropping of retrans hint counter), long enough mm sit would be nice route to rbtree stuff since there it's getting at least bit more versatile testing that we alone can put to it... I will try hard to make time to look at your rebase and try to move things forward. ...In case you want to validate them, git-patch-id helps to exclude identical changes between before and after allowing you to spend more time on things that required non-trivial resolution... I've been all-consumed by the NAPI work and a driver I've been writing in the background for the past few weeks. ...I can try to help... I suggest that as first step we take all changes that do not cause any conflicts (I've already tried cherry-picks), only thing I'm not sure whether I should combine change+fix parts or keep them as they were in tcp-2.6 (I suggest we combine them but you may disagree..?). 
I can either post them as a series or give them to you in --pretty=oneline format if you want to cherry-pick them yourself (in that case having a common tree would help a bit as commit ids would match as well then)). However, at least highest_sack inclusion (no space loss as one hint counter can be then dropped and allows accurate fackets out which I've partially coded already) will also need the SACK-block validator. But I basically have to redo it as the validator patch was sacktag reorganization based previously (I've already found a couple of improvements to its accuracy too :-))... -- i.
Re: TCP's initial cwnd setting correct?...
On Tue, 7 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 6 Aug 2007 15:37:15 +0300 (EEST) ...Another thing that makes me wonder, is the tp-mss_cache 1460 check as based on rfc3390 (and also it's precursor rfc2414) with up to 2190 bytes MSS TCP can use 3 as initial cwnd... I did the research and my memory was at least partially right. Below is an old bogus change of mine and the later revert with Alexey's explanation. This seems to be dealing with receive window calculation issues, rather than snd_cwnd. But they might be related and you should consider this very seriously. ...Thanks about the info, I'll study it carefully and see what's relevant in here too. And anyway we're currently on the safer side as the potential issues only make it more conservative than the RFC allows, so there's no hurry to get it in until the solution is acceptable if there's indeed a need for a change. -- i.
[RFC] stuff from tcp-2.6 partially merged to upcoming net-2.6.24?
Hi Dave, ...Noticed you were planning to open net-2.6.24 tree... IMHO, part of the stuff in tcp-2.6 could be merged to 2.6.24. I suggest that most of the stuff which is not directly related to the rbtree, new lost marker, nor sacktag reorganization is taken. Some of those things are very trivial to take as they do not introduce any conflicts. Besides that there is some stuff that would need some work if taken, as it is built on top of stuff that will remain only in tcp-2.6 (includes left_out removal and IsReno/Fack conversion)... But if it's ok, I could try to come up with a solution even to them... Perhaps do this in two (or more) stages by first taking the trivial ones... I tried rebasing tcp-2.6 (there's some not yet submitted work on top of it too) to top of be1b685fe6c9928848b26b568eaa86ba8ce0046c, result is here: http://www.cs.helsinki.fi/u/ijjarvin/tcp-rebase/{before,after} ...There was at least one gotcha (sacktag's flag reset position change when sacktag_state is created). But all in all, conflicts weren't that hard to resolve... One may resolve some things differently than I did, so YMMV if you want to try that yourself... :-) ...I also diffed all.patch'es to see if there was some undesired side-effect from diff but didn't find any. Currently only compile tested. Do you have any suggestion how I should proceed? Or do you perhaps object such partial merge completely? ...I could try to come up with a cleaned up patch series which has original and their bug fix parts combined to a single patch per change (would provide cleaner history and shouldn't be very hard to do either)... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
TCP's initial cwnd setting correct?...
I'm I missing something or why it seems to me that whenever rtt stuff is reset, also tcp_init_cwnd is not called to do RFC3390-like initial window setting (remains then in 2 which is set by tcp_v4_init_sock)... Is this done on purpose or what? ...Another thing that makes me wonder, is the tp-mss_cache 1460 check as based on rfc3390 (and also it's precursor rfc2414) with up to 2190 bytes MSS TCP can use 3 as initial cwnd... ...Compile tested patch below in case these are valid concerns... ...Goto logic could be cleaner (somebody has any suggestion for better way to structure it?) -- i. [PATCH] [TCP]: Use IW also when RTT vars get reset update to match RFC Previously when RTT got reset, initial window was not being set. Initial window defined RFC3390 (and it's precursor 2414) allows using initial window of three as long as MSS 2190 bytes. Also update RFC2414 references as it's obsoleted by RFC3390. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 32 +--- net/ipv4/tcp_output.c |2 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb96864..89794ec 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,13 +805,13 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC2414. */ +/* Numbers are taken from RFC3390. */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp-mss_cache 1460) + if (tp-mss_cache = 2190) cwnd = 2; else cwnd = (tp-mss_cache 1095) ? 3 : 4; @@ -897,22 +897,24 @@ static void tcp_init_metrics(struct sock *sk) } tcp_set_rto(sk); tcp_bound_rto(sk); - if (inet_csk(sk)-icsk_rto TCP_TIMEOUT_INIT !tp-rx_opt.saw_tstamp) - goto reset; - tp-snd_cwnd = tcp_init_cwnd(tp, dst); - tp-snd_cwnd_stamp = tcp_time_stamp; - return; + + if (inet_csk(sk)-icsk_rto TCP_TIMEOUT_INIT !tp-rx_opt.saw_tstamp) { reset: - /* Play conservative. 
If timestamps are not -* supported, TCP will fail to recalculate correct -* rtt, if initial rto is too small. FORGET ALL AND RESET! -*/ - if (!tp-rx_opt.saw_tstamp tp-srtt) { - tp-srtt = 0; - tp-mdev = tp-mdev_max = tp-rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)-icsk_rto = TCP_TIMEOUT_INIT; + /* Play conservative. If timestamps are not +* supported, TCP will fail to recalculate correct +* rtt, if initial rto is too small. FORGET ALL AND RESET! +*/ + if (!tp-rx_opt.saw_tstamp tp-srtt) { + tp-srtt = 0; + tp-mdev = tp-mdev_max = tp-rttvar = TCP_TIMEOUT_INIT; + inet_csk(sk)-icsk_rto = TCP_TIMEOUT_INIT; + } } + + tp-snd_cwnd = tcp_init_cwnd(tp, dst); + tp-snd_cwnd_stamp = tcp_time_stamp; + return; } static void tcp_update_reordering(struct sock *sk, const int metric, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3abe22e..4f9be23 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -209,7 +209,7 @@ void tcp_select_initial_window(int __space, __u32 mss, } /* Set initial window to value enough for senders, -* following RFC2414. Senders, not following this RFC, +* following RFC3390. Senders, not following this RFC, * will be satisfied with 2. */ if (mss (1*rcv_wscale)) { -- 1.5.0.6
Re: TCP's initial cwnd setting correct?...
On Mon, 6 Aug 2007, Ilpo Järvinen wrote: ...Goto logic could be cleaner (somebody has any suggestion for better way to structure it?) ...I could probably move the setting of snd_cwnd earlier to avoid this problem if this seems a valid fix at all. -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Tue, 31 Jul 2007, Darryl L. Miles wrote: I've been able to capture a tcpdump from both ends during the problem and its my belief there is a bug in 2.6.20.1 (at the client side) in that it issues a SACK option for an old sequence which the current window being advertised is beyond it. This is the most concerning issue as the integrity of the sequence numbers doesn't seem right (to my limited understanding anyhow). You probably didn't check the reference I explicitly gave to those who are not familiar how DSACK works, just in case you didn't pick it up last time, here it is again for you: RFC2883... However, if DSACKs really bother you still (though it shouldn't :-)), IIRC I also told you how you're able to turn it off (tcp_dsack sysctl) but I assure you that it's not a bug but feature called DSACK [RFC2883], there's _absolutely_ nothing wrong with it, instead, it would be wrong to _not_ send the below snd_una SACK in this scenario when tcp_dsack set to 1. There is another concern of why the SERVER performed a retransmission in the first place, when the tcpdump shows the ack covering it has been seen. There are only three possible reasons to this thing: 1) The ACK didn't reach the SERVER (your logs prove this to not be the case) 2) The ACK got discarded by the SERVER 3) The SERVER (not the client) is buggy and sends an incorrect retransmission ...So we have just two options remaining... I have made available the full dumps at: http://darrylmiles.org/snippets/lkml/20070731/ Thanks about these... Based on a quick check, it is rather clear that the SERVER is for some reason discarding the packets it's receiving: 04:11:26.833935 IP CLIENT.43726 SERVER.ssh: P 4239:4287(48) ack 28176 win 501 nop,nop,timestamp 819646456 16345815 04:11:27.132425 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 17096579 819458864 04:11:27.230081 IP CLIENT.43726 SERVER.ssh: . 
ack 28176 win 501 nop,nop,timestamp 819646555 16345815,nop,nop Notice, (cumulative) ack field didn't advance though new data arrived, and for the record, it's in advertised window too. There are no DSACK in here so your theory about below snd_una SACK won't help to explain this one at all... We'll just have to figure out why it's discarding it. And there's even more to prove this... This sequence is interesting from the client side: 03:58:56.419034 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 16345815 819458859 # S1 03:58:56.419100 IP CLIENT.43726 SERVER.ssh: . ack 27464 win 501 nop,nop,timestamp 819458884 16345815 # C1 03:58:56.422019 IP SERVER.ssh CLIENT.43726: P 27464:28176(712) ack 4239 win 2728 nop,nop,timestamp 16345815 819458859 # S2 03:58:56.422078 IP CLIENT.43726 SERVER.ssh: . ack 28176 win 501 nop,nop,timestamp 819458884 16345815 # C2 The above 4 packets look as expect to me. Then we suddenly see a retransmission of 26016:27464. 03:58:56.731597 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 16346128 819458864 # S3 ...Look at this on the retransmission: ... timestamp 16346128 819458864 ...it tells us what really got received by the TCP. The corresponding ACK with matching timestamp is, surprise, surprise, this one: 03:58:56.340180 IP CLIENT.43726 SERVER.ssh: . ack 26016 win 501 nop,nop,timestamp 819458864 16345734 ...thus the SERVER has _not_ received but discarded the subsequent cumulative ACKs!!! Therefore it's retransmitting from 26016 onward but never receives any reply as everything seems to get discarded... There was one bad checksum btw: 03:58:56.365662 IP (tos 0x10, ttl 64, id 28685, offset 0, flags [DF], proto 6, length: 764) SERVER.ssh CLIENT.43726: P [bad tcp cksum 6662 (-ef2b)!] 617734888:617735600(712) ack 2634113543 win 2728 nop,nop,timestamp 16345815 819458859 There are some changes in 2.6.22 that appear to affect TCP SACK handling does this fix a known issue ? 
There is no such known issue :-)... This issue has nothing to do with TCP SACK handling, since that code _won't_ be reached... We could verify that from the timestamps. But if you still insist that SACK under snd_una is the issue, please turn tcp_dsack to 0 on the CLIENT, you will not get them after that and you can be happy as your out-of-window SACK issue is then fixed :-)... ...Seriously, somebody else than me is probably better in suggesting what could cause the discarding at the SERVER in this case. SNMP stuff Dave was asking could help, you can find them from /proc/net/{netstat,snmp}... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] [TCP]: Also handle snd_una changes in tcp_cwnd_down
On Wed, 1 Aug 2007, Ilpo Järvinen wrote: tcp_cwnd_down must check for it too as it should be conservative in case of collapse stuff and also when receiver is trying to lie (though that wouldn't be very successful/useful anyway). Note: - Separated also is_dupack and do_lost in fast_retransalert * Much cleaner look-and-feel now * This time it really fixes cumulative ACK with many new SACK blocks recovery entry (I claimed this fixes with last patch but it wasn't). TCP will now call tcp_update_scoreboard regardless of is_dupack when in recovery as long as there is enough fackets_out. - Introduce FLAG_SND_UNA_ADVANCED * Some prior_snd_una arguments are unnecessary after it - Added helper FLAG_ANY_PROGRESS to avoid long FLAG...|FLAG... constructs Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Dave, BEWARE, I wasn't able to do anything else but compile test because Linus' tree didn't seem to boot on the machine I was trying to test it... :-( I think that to stable version only a small part of this change is necessary, not the full changeset. That should keep stable folks much happier... :-) I'll soon put my reduced proposal to: http://www.cs.helsinki.fi/u/ijjarvin/patches/stable-0001.patch The other patch (DSACK) can go to stable as is. I placed those two earlier sent bidir fixes and these two additional fixes on top of 2.6.22 and was finally able to have them tested on a bootable kernel (I had a boot failure on another machine too with 2.6.23-rc1 stuff). FACKNewReno/bidir and FACK/unidir tested, time-seq graphs were ok. Dave, please put these two patches to net-2.6 to complete bidir fix series. ...And please push to stable as well, take just the minimized fix portion of this [TCP]: Also handle snd_una changes in tcp_cwnd_down patch as I described above. Other cleanups in it can be put just to net-2.6. -- i.
Re: TCP SACK issue, hung connection, tcpdump included
...I dropped lkml, it's useless to bother them with this network related stuff... On Thu, 2 Aug 2007, Darryl Miles wrote: Ilpo Järvinen wrote: On Tue, 31 Jul 2007, Darryl L. Miles wrote: [...RFC bashing, snip...] * The older linux kernel for not being 100% SACK RFC compliant in its implementation ? Not a lot we can do about this now, but if we're able to identify there maybe backward compatibility issues with the same implementation that's a useful point to take forward. * The newer linux kernel for enabling D-SACK by default when RFC2883 doesn't even claim a cast iron case for D-SACK to be compatible with any 100% RFC compliant SACK implementation. Are you aware that D-SACK processing and generation has been part of the linux kernel TCP far before 2.6 series even begun... ...and it goes far beyond that, 2.4.0 had it too (2.2.0 didn't seem to have it, never before have I read that one IIRC :-) ). Does Ilpo have a particular vested interest in D-SACK that should be disclosed? Sure :-). ...my interest was to show you that it's not a bug :-). So it is necessary to turn off a TCP option (that is enabled by default) to be sure to have reliable TCP connections (that don't lock up) in the bugfree Linux networking stack ? This is absurd. ...You'll have to turn a lot off to be compatible with everything around Internet, and still you would probably fail. Some people have to, e.g., to turn off window scaling to work-around buggy intermediate nodes (nat boxes or some firewalls), there's even a sysctl to workaround signed 16-bit window arithmetic bugs that's mostly legacy but I bet you can find hosts broken in that area too. Etc. Yet we don't turn those off by default. If such an option causes such a problem; then that option should not be enabled by default. ...Linux TCP has enabled by default options which are _known_ (at least nowadays) to cause bad problems and many of them are _still_ enabled... Browse archives if you don't believe me... 
And I'm relatively sure it will do so also in future though I'm not the maintainer nor anybody to make such decisions... rather than wallpaper over the cracks with the voodoo of turning things that are enabled by default off. ...I said that because it felt like you kept repeating that the generated DSACK block is a bug even though, like you now know, it's a feature, not a bug. :-) 2) The ACK got discarded by the SERVER I'd thought about that one, its a possibility. The server in question does have period of high UDP load on a fair number of UDP sockets at once (a few 1000). Both systems have 2Gb of RAM. The server has maybe just 250Mb of RSS of all apps combined. ...There are three independent signs in the log to indicate discard out of these 3 reasons. Whereas your theory _fails_ to explain some behavior in the log you presented, e.g., not updated timestamp which happen even _before_ the DSACK stuff?!?... I'll formulate this question: why didn't snd_una advance nor timestamp update though a cumulative ACK arrived? You can check for yourself (in server log): 03:58:56.384503 03:58:56.462583 03:58:56.465707 03:58:56.678546 ...I'm hoping SNMPs provide explanation to it. The client sent a SACK. But from understanding more about D-SACK, this is a valid D-SACK response, but it appears to confuse the older Linux kernel at the server end. ...Are you saying that it's confused by _DSACK_ just because it's only strange thing you seem to find from the log? I see other things in your log which are exceptional and point to elsewhere... Please don't neglect them... ...Problems occur already before that DSACK is received by the server end. Agreed on this. However discarding data is allowed (providing it is intentional discarding not a bug where the TCP stack is discarding segments it shouldn't), TCP should recover providing sufficient packets get through. 
But if one end decides to discard everything after time t, TCP _will not_ recover because sufficient packets won't get through... And that's what your log is telling me. Yes discarding is allowed but that wasn't the point, we're more interested here on why it got discarded rather than allowance of discarding. Forgive me if I am mistaken, but while the server reports a checksum error, the client did not. I took this to be a misreporting by tcpdump at the server, probably due to the e1000 network card checksum offloading ...That's probably the reason, I agree, these show up. Thought that also myself, besides, it wouldn't cause that kind of breakage anyway. So the SNMP data would show up intentional discards (due to memory/resource issues). So I'll get some of those too. The SNMP stats aren't so useful right now as the box has been rebooted since then but I shall attempt to capture /proc/net/* data, cause the problem, then capture /proc/net/* data again if those numbers can help. Good, thanks. -- i.
[PATCH 2/2] [TCP]: DSACK signals data receival, be conservative
In case a DSACK is received, it's better to lower cwnd as it's a sign of data receival. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c3124e6..f030435 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,6 +103,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -966,12 +967,14 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* Check for D-SACK. */ if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)-ack_seq)) { + flag |= FLAG_DSACKING_ACK; found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1 !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { + flag |= FLAG_DSACKING_ACK; found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); @@ -1858,7 +1861,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; - if ((flagFLAG_ANY_PROGRESS) || + if ((flag(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || (IsReno(tp) !(flagFLAG_NOT_DUP))) { tp-snd_cwnd_cnt = decr1; decr = 1; -- 1.5.0.6
[PATCH 1/2] [TCP]: Also handle snd_una changes in tcp_cwnd_down
tcp_cwnd_down must check for it too as it should be conservative in case of collapse stuff and also when receiver is trying to lie (though that wouldn't be very successful/useful anyway). Note: - Separated also is_dupack and do_lost in fast_retransalert * Much cleaner look-and-feel now * This time it really fixes cumulative ACK with many new SACK blocks recovery entry (I claimed this fixes with last patch but it wasn't). TCP will now call tcp_update_scoreboard regardless of is_dupack when in recovery as long as there is enough fackets_out. - Introduce FLAG_SND_UNA_ADVANCED * Some prior_snd_una arguments are unnecessary after it - Added helper FLAG_ANY_PROGRESS to avoid long FLAG...|FLAG... constructs Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Dave, BEWARE, I wasn't able to do anything else but compile test because Linus' tree didn't seem to boot on the machine I was trying to test it... :-( I think that to stable version only a small part of this change is necessary, not the full changeset. That should keep stable folks much happier... :-) I'll soon put my reduced proposal to: http://www.cs.helsinki.fi/u/ijjarvin/patches/stable-0001.patch The other patch (DSACK) can go to stable as is. net/ipv4/tcp_input.c | 34 ++ 1 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 378ca8a..c3124e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -102,11 +102,13 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. 
*/ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) +#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define IsReno(tp) ((tp)-rx_opt.sack_ok == 0) #define IsFack(tp) ((tp)-rx_opt.sack_ok 2) @@ -1856,7 +1858,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; - if ((flagFLAG_FORWARD_PROGRESS) || + if ((flagFLAG_ANY_PROGRESS) || (IsReno(tp) !(flagFLAG_NOT_DUP))) { tp-snd_cwnd_cnt = decr1; decr = 1; @@ -2107,15 +2109,13 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). */ static void -tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, - int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp-snd_una == prior_snd_una -(!(flagFLAG_NOT_DUP) || - ((flagFLAG_DATA_SACKED) - (tp-fackets_out tp-reordering; + int is_dupack = !(flag(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flagFLAG_DATA_SACKED) + (tp-fackets_out tp-reordering)); /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ @@ -2192,14 +2192,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* F. Process state. 
*/ switch (icsk-icsk_ca_state) { case TCP_CA_Recovery: - if (prior_snd_una == tp-snd_una) { + if (!(flag FLAG_SND_UNA_ADVANCED)) { if (IsReno(tp) is_dupack) tcp_add_reno_sack(sk); } else { int acked = prior_packets - tp-packets_out; if (IsReno(tp)) tcp_remove_reno_sacks(sk, acked); - is_dupack = tcp_try_undo_partial(sk, acked); + do_lost = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: @@ -2215,7 +2215,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Loss is undone; fall through to processing in Open state. */ default: if (IsReno(tp)) { - if (tp-snd_una != prior_snd_una) + if (flag FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(sk); @@ -2264,7 +2264,7
Re: [PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
On Tue, 31 Jul 2007, Stephen Hemminger wrote: I noticed no difference in the two flow tests. That is not a bad thing, just that this test doesn't hit that code. ...I'm not too sure about your test setup but the bugs I fixed only cover cases that occur if flow is bidirectional (and obviously active in both directions at the same time), they won't occur in a case of unidirectional transfer or in request-reply style connections (well, in the latter case if there's some overlap, it can have effect but that's usually not significant)... In case of bidirectional transfers, you *should* see some difference as previously the fast recovery was _very_ broken. Of course there could be other issue with large cwnd TCP that hides it by going to RTO still, but at least over 384k/200ms link (DBP sized buffers, IIRC), these change behavior very dramatically, mainly in the initial slow-start overshoot recovery because in there losses per RTT is so high number compared to what is experienced later on. One or a few losses are usually recovered without RTO when congestion happens later on. The anomaly is that first flow does slow start then gets loss and ends up reducing its window size all the way to the bottom, finally it recovers. This happens with Cubic, H-TCP and others as well; if the queue in the network is large enough, they don't handle the initial loss well. ...TCP related stuff that changed in /proc/net/netstat might shed some light to this if none of the given explanations please you... :-) See the graph. What exactly do you mean by RENO in the title, I mean what's tcp_sack set to? There is occasionally a bit confusion in that respect in the terminology @ netdev, I've used to reno referring to non-SACK stuff elsewhere but in here that's not always the case... Usually it's possible to derive the correct interpretation from the context, but in this case I'm not too sure... 
:-) What I have often seen with non-SACK TCP is that initial slow-start exhausts even very large advertised window on high DBP link and then due to draining of ACK feedback, gets RTOed... That usually shows up as long lasting recovery where one segment per RTT is recovered and new data is being sent as duplicate ACKs arrive with nearly constant rate until the window limit is hit (but I cannot see such period in the graph you posted, so I guess it's not the explanation in this case). And if your RENO refers to something with SACK, that's not going to explain it anyway. ...Another nasty one I know is RED+ECN, though I'd say it's a bit far fetched one, as ECN cannot be used nicely in retransmission, a retransmission gets dropped instead of marking if RED wanted to mark. I guess that doesn't occur in your test case either? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6 0/2] [TCP]: Fix bidirectional brokenness
Hi Dave, While testing frto with bidirection TCP a while (months) ago, I encountered time-seq graphs which made absolutely no sense as if recoveries only completed after RTO. As a result, I noticed that rate-halving has problem when a flow is bidirection but testing the patch has been on my todo list for ages. ...Finally, here it is... ...Please think the first one a bit because there might be some corner cases with reno. While testing it I came across with an additional issue that can occur with bidirectional traffic. And, even better, the second fix seems to also solves a third issue which affects both unidirectional and bidirectional flows, though it's a marginal case (cumulative ACK that causes a larger number of new SACKed skbs). I've never seen that third one to occur but it's there if you get enough ACK losses, subsequent ACKs (if one gets them) do solve it so that's not a very bad issue to begin with... I've verified these from time-seq graphs on top of tcp-2.6, which had some additional (mostly cleanup and the rebase I promised you earlier) though I added the tp-fackets_out tp-reordering check afterwards as it seems necessary to avoid going to lost marker too often (wouldn't have had any effect in my test case anyway). ...Please consider to net-2.6 and to stable too. These will generate you some hassle when you rebase tcp-2.6. Btw, you forgot to push tcp-2.6 out last time though I could assume it's state... :-) In case you're going to now push it out, could you please drop [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked from it as it seems to be out of place in tcp-2.6, I can resubmit it to net-2.6.24 when you open it (unless you want to put it to net-2.6 directly as it's rather trivial one). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
Actually, the ratehalving seems to work too well, as cwnd is reduced on every second ACK even though the packets in flight remains unchanged. Recoveries in a bidirectional flows suffer quite badly because of this, both NewReno and SACK are affected. After this patch, rate halving is performed for ACK only if packets in flight was supposedly changed too. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 23 +-- 1 files changed, 13 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0163051..767f92c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1848,19 +1848,22 @@ static inline u32 tcp_cwnd_min(const struct sock *sk) } /* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct sock *sk) +static void tcp_cwnd_down(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; + + if ((flagFLAG_FORWARD_PROGRESS) || + (IsReno(tp) !(flagFLAG_NOT_DUP))) { + tp-snd_cwnd_cnt = decr1; + decr = 1; - tp-snd_cwnd_cnt = decr1; - decr = 1; + if (decr tp-snd_cwnd tcp_cwnd_min(sk)) + tp-snd_cwnd -= decr; - if (decr tp-snd_cwnd tcp_cwnd_min(sk)) - tp-snd_cwnd -= decr; - - tp-snd_cwnd = min(tp-snd_cwnd, tcp_packets_in_flight(tp)+1); - tp-snd_cwnd_stamp = tcp_time_stamp; + tp-snd_cwnd = min(tp-snd_cwnd, tcp_packets_in_flight(tp)+1); + tp-snd_cwnd_stamp = tcp_time_stamp; + } } /* Nothing was retransmitted or returned timestamp is less @@ -2057,7 +2060,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); } } @@ -2257,7 +2260,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (is_dupack || tcp_head_timedout(sk)) tcp_update_scoreboard(sk); - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } -- 1.5.0.6
[PATCH net-2.6 2/2] [TCP]: Bidir flow must not disregard SACK blocks for lost marking
It's possible that new SACK blocks that should trigger new LOST markings arrive with new data (which previously made is_dupack false). In addition, I think this fixes a case where we get a cumulative ACK with enough SACK blocks to trigger the fast recovery (is_dupack would be false there too). I'm not completely pleased with this solution because readability of the code is somewhat questionable as 'is_dupack' in SACK case is no longer about dupacks only but would mean something like 'lost_marker_work_todo' too... But because of Eifel stuff done in CA_Recovery, the FLAG_DATA_SACKED check cannot be placed to the if statement which seems attractive solution. Nevertheless, I didn't like adding another variable just for that either... :-) Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 767f92c..cfe6ac7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2109,7 +2109,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp-snd_una == prior_snd_una !(flagFLAG_NOT_DUP)); + int is_dupack = (tp-snd_una == prior_snd_una +(!(flagFLAG_NOT_DUP) || + ((flagFLAG_DATA_SACKED) + (tp-fackets_out tp-reordering; /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ -- 1.5.0.6
Re: [PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
On Mon, 30 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 30 Jul 2007 20:18:45 +0300 (EEST) Actually, the ratehalving seems to work too well, as cwnd is reduced on every second ACK even though the packets in flight remains unchanged. Recoveries in a bidirectional flows suffer quite badly because of this, both NewReno and SACK are affected. After this patch, rate halving is performed for ACK only if packets in flight was supposedly changed too. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] This fix looks great, and I see no potential problems with Reno after doing a bit of auditing. Applied and I'll push to -stable too. I think it's probably good to add tp-snd_una != prior_snd_una check there too... It's not going to make a large difference, mostly just to be conservative when skb collapse stuff got done (and maybe to annoy cheaters too though I couldn't at this point figure out how they could abuse it)... ...I think I can come up with that on Wednesday, so please hold stable push until that. -- i.
Re: [PATCH net-2.6 2/2] [TCP]: Bidir flow must not disregard SACK blocks for lost marking
On Mon, 30 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 30 Jul 2007 20:19:40 +0300 (EEST) I'm not completely pleased with this solution because readability of the code is somewhat questionable as 'is_dupack' in SACK case is no longer about dupacks only but would mean something like 'lost_marker_work_todo' too... But because of Eifel stuff done in CA_Recovery, the FLAG_DATA_SACKED check cannot be placed to the if statement which seems attractive solution. Nevertheless, I didn't like adding another variable just for that either... :-) I don't mind the complex conditionals so much in loss handling, they are almost inevitable. However I believe they could be simplified as a lot of pieces of code ask similar if not identical questions. We could ask several of these things up-front, regardless of path we will take (reno, DSACK, reorder, FRTO, etc.) and pass the answers along in a bitmask. We do that to some extent already with how we analyze the retransmit queue at the beginning of ACK processing. That's true, ...and my thought too as I was thinking of adding FLAG_SND_UNA_ADVANCED (!= FLAG_DATA_ACKED) (or it's opposite as I haven't yet checked which way it is more useful). That could help in many places and probably reduces compare pressure here and there as the bitmask operation can be done in one compare where it now has to do two compares, one for snd_una and other for the flag. ...I'll keep an eye on other possible bits too... :-) -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 06:59:26AM +0100, Darryl L. Miles wrote: CLIENT = Linux 2.6.20.1-smp [Customer build] SERVER = Linux 2.6.9-55.ELsmp [Red Hat Enterprise Linux AS release 4 (Nahant Update 5)] The problems start around time index 09:21:39.860302 when the CLIENT issues a TCP packet with SACK option set (seemingly for a data segment which has already been seen) from that point on the connection hangs. ...That's DSACK and it's being correctly sent. To me, it seems unlikely to be the cause for this breakage... Where was the capture taken ? on CLIENT or on SERVER (I suspect client from the timers) ? ...I would guess the same based on SYN timestamps (and from the DSACK timestamps)... A possible, but very unlikely reason would be an MTU limitation somewhere, because the segment which never gets correctly ACKed is also the largest one in this trace. Limitation for 48 byte segments? You have to be kidding... :-) But yes, it seems that one of the directions is dropping packets for some reason though I would not assume MTU limitation... Or did you mean some other segment? Also, if everything is right with the packets on the server side, then it would means that it's the RHEL kernel which is buggy (which does not mean that the same bug does not exist in mainline). ...There are two independed signs that server side is not getting the packets (ack field is not advanced nor do the retransmissions of an old data cease) so IMHO that would the first thing to rule out. Full dump available via email. 09:12:13.444999 IP CLIENT.50727 SERVER.ssh: S 2919512080:2919512080(0) win 5840 mss 1460,sackOK,timestamp 799860259 0,nop,wscale 7 09:12:13.535643 IP SERVER.ssh CLIENT.50727: S 492909547:492909547(0) ack 2919512081 win 5792 mss 1460,sackOK,timestamp 7126976 799860259,nop,wscale 2 09:12:13.535717 IP CLIENT.50727 SERVER.ssh: . 
ack 1 win 46 nop,nop,timestamp 799860282 7126976 09:12:13.665481 IP SERVER.ssh CLIENT.50727: P 1:24(23) ack 1 win 1448 nop,nop,timestamp 7127074 799860282 09:12:13.665895 IP CLIENT.50727 SERVER.ssh: . ack 24 win 46 nop,nop,timestamp 799860314 7127074 09:12:13.666044 IP CLIENT.50727 SERVER.ssh: P 1:23(22) ack 24 win 46 nop,nop,timestamp 799860314 7127074 09:12:13.776318 IP SERVER.ssh CLIENT.50727: . ack 23 win 1448 nop,nop,timestamp 7127216 799860314 ...SNIPPED SUCCESSFUL TRAFFIC... 09:21:39.490740 IP SERVER.ssh CLIENT.50727: P 18200:18464(264) ack 2991 win 2728 nop,nop,timestamp 7692910 81727 09:21:39.490775 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81755 7692910 09:21:39.860245 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693293 81749 09:21:39.860302 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81847 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:40.453440 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693887 81749 09:21:40.453495 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81996 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:41.641821 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7695075 81749 09:21:41.641938 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 82293 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:44.017552 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7697451 81749 09:21:44.017622 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 82887 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:48.770051 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7702203 81749 09:21:48.770099 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 84075 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:58.274061 IP SERVER.ssh CLIENT.50727: . 
12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7711707 81749 09:21:58.274180 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 86450 7692910,nop,nop,sack sack 1 {12408:13856} 09:22:17.282035 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7730715 81749 09:22:17.282153 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 800011202 7692910,nop,nop,sack sack 1 {12408:13856} 09:22:55.298955 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7768731 81749 09:22:55.299023 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 800020705 7692910,nop,nop,sack sack 1 {12408:13856} 09:24:11.329853 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7844763 81749 09:24:11.329923 IP CLIENT.50727 SERVER.ssh: . ack
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 11:26:00AM +0300, Ilpo Järvinen wrote: On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 06:59:26AM +0100, Darryl L. Miles wrote: CLIENT = Linux 2.6.20.1-smp [Customer build] SERVER = Linux 2.6.9-55.ELsmp [Red Hat Enterprise Linux AS release 4 (Nahant Update 5)] The problems start around time index 09:21:39.860302 when the CLIENT issues a TCP packet with SACK option set (seemingly for a data segment which has already been seen) from that point on the connection hangs. ...That's DSACK and it's being correctly sent. To me, it seems unlikely to be the cause for this breakage... Where was the capture taken ? on CLIENT or on SERVER (I suspect client from the timers) ? ...I would guess the same based on SYN timestamps (and from the DSACK timestamps)... A possible, but very unlikely reason would be an MTU limitation somewhere, because the segment which never gets correctly ACKed is also the largest one in this trace. Limitation for 48 byte segments? You have to be kidding... :-) But yes, it seems that one of the directions is dropping packets for some reason though I would not assume MTU limitation... Or did you mean some other segment? No, I was talking about the 1448 bytes segments. But in fact I don't believe it much because the SACKs are always retransmitted just afterwards. Ah, but it's ACKed correctly right below it...: [...snip...] 09:21:39.490740 IP SERVER.ssh CLIENT.50727: P 18200:18464(264) ack 2991 win 2728 nop,nop,timestamp 7692910 81727 09:21:39.490775 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81755 7692910 09:21:39.860245 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693293 81749 ...segment below snd_una arrived = snd_una remains 18464, receiver generates a duplicate ACK: 09:21:39.860302 IP CLIENT.50727 SERVER.ssh: . 
ack 18464 win 378 nop,nop,timestamp 81847 7692910,nop,nop,sack sack 1 {12408:13856} The cumulative ACK field of it covers _everything_ below 18464 (i.e., it ACKs them), including the 1448 bytes in 12408:13856... In addition, the SACK block is DSACK information [RFC2883] telling explicitly the address of the received duplicate block. However, if this ACK doesn't reach the SERVER TCP, RTO is triggered and the first not yet cumulatively ACKed segment is retransmitted (I guess cumulative ACKs up to 12408 arrived without problems to the SERVER): 09:21:40.453440 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693887 81749 [...snip...] BTW, some information are missing. It would have been better if the trace had been read with tcpdump -Svv. We would have got seq numbers and ttl. Also, we do not know if there's a firewall between both sides. Sometimes, some IDS identify attacks in crypted traffic and kill connections. It might have been the case here, with the connection closed one way on an intermediate firewall. Yeah, firewall or some other issue, I'd say it's quite unlikely a bug in TCP because behavior to both directions indicate client - sender blackhole independently of each other... -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 12:28:04PM +0300, Ilpo Järvinen wrote: [...snip...] BTW, some information are missing. It would have been better if the trace had been read with tcpdump -Svv. We would have got seq numbers and ttl. Also, we do not know if there's a firewall between both sides. Sometimes, some IDS identify attacks in crypted traffic and kill connections. It might have been the case here, with the connection closed one way on an intermediate firewall. Yeah, firewall or some other issue, I'd say it's quite unlikely a bug in TCP because behavior to both directions indicate client - sender blackhole independently of each other... It would also be possible that something stupid between both ends simply drops packets with the SACK option set. Also something which sometimes happen is that some firewalls automatically translate sequence numbers but not necessarily SACK values, which could pretty well lead to this packet being received but ignored on the server side. ...One can toggle those off with /proc/sys/net/ipv4/tcp_dsack but I suspect DSACKs are not the cause because these retransmissions neither are making it through (there are many of them also earlier in the log, just quoted the easiest ones :-) ): 09:36:44.335591 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800227942 7692910 09:38:44.351950 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800257942 7692910 09:40:44.368172 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800287943 7692910 ...there are no SACKs involved in them, yet no cumulative ACK ever arrives from SERVER... -- i.
Re: [RFC 2/3] TCP: cubic - eliminate use of receive time stamp
Hi all, It's definately better to do rtt measurements in the proper callback like you're suggesting since the call to cong_avoid depends on too many factors (ECN, FRTO) that shouldn't inhibit rtt sampling though they, for a period, determine how cwnd must be set... -- i. On Thu, 19 Jul 2007, Stephen Hemminger wrote: Remove use of received timestamp option value from RTT calculation in Cubic. A hostile receiver may be returning a larger timestamp option than the original value. This would cause the sender to believe the malevolent receiver had a larger RTT and because Cubic tries to provide some RTT friendliness, the sender would then favor the liar. Instead, use the jiffie resolutionRTT value already computed and passed back after ack. Signed-off-by: Stephen Hemminger [EMAIL PROTECTED] --- a/net/ipv4/tcp_cubic.c2007-07-19 08:21:42.0 +0100 +++ b/net/ipv4/tcp_cubic.c2007-07-19 08:24:58.0 +0100 @@ -246,38 +246,12 @@ static inline void bictcp_update(struct ca-cnt = 1; } - -/* Keep track of minimum rtt */ -static inline void measure_delay(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 delay; - - /* No time stamp */ - if (!(tp-rx_opt.saw_tstamp tp-rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - (s32)(tcp_time_stamp - ca-epoch_start) HZ) - return; - - delay = (tcp_time_stamp - tp-rx_opt.rcv_tsecr)3; - if (delay == 0) - delay = 1; - - /* first time call or link delay decreases */ - if (ca-delay_min == 0 || ca-delay_min delay) - ca-delay_min = delay; -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (data_acked) - measure_delay(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -337,14 +311,30 @@ static void bictcp_state(struct sock *sk static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { const struct inet_connection_sock *icsk = 
inet_csk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { - struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; } -} + /* Some calls are for duplicates without timetamps */ + if (rtt_us 0) + return; + + /* Discard delay samples right after fast recovery */ + if ((s32)(tcp_time_stamp - ca-epoch_start) HZ) + return; + + delay = usecs_to_jiffies(rtt_us) 3; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca-delay_min == 0 || ca-delay_min delay) + ca-delay_min = delay; +} static struct tcp_congestion_ops cubictcp = { .init = bictcp_init, - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linux 2.6.22: Leak r=1 1
On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux [EMAIL PROTECTED] /proc/sys/net/ipv4]# grep . * ...snip... tcp_frto:1 ...This is fully unrelated to the issue but I'm a bit curious who enabled frto on your machine (since it's disabled by default), did you do it by yourself or the distribution perhaps? This is interesting because for frto to be useful on a large scale, the sender must have it enabled and therefore it's usually not under control of the host that is attached to the wireless access link. ...In case some distribution (yours) is already enabling it, the deployment is somewhat proceeding already :-)... ...And we also get incremental testing of the code too... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linux 2.6.22: Leak r=1 1
On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Took a while to grep what is leaking. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux Just normal Internet usage, azureus for example =) I think this is easy to trigger. I guess those packet loss periods help you to reproduce it so easily. # ss -n|wc -l 870 # ping -A 80.223.96.1 PING 80.223.96.1 (80.223.96.1) 56(84) bytes of data. 64 bytes from 80.223.96.1: icmp_seq=1 ttl=255 time=431 ms ... --- 80.223.96.1 ping statistics --- 40 packets transmitted, 25 received, 37% packet loss, time 17954ms rtt min/avg/max/mdev = 406.000/467.758/530.983/29.384 ms, pipe 2, ipg/ewma 460.361/456.381 ms But ploss is only temporary (when I am downloading with azureus =) , when only uploading (95% of bandwidth used) rtt avg = 32ms). # dmesg|grep Leak [114992.191011] Leak r=1 4 [124231.713348] Leak r=1 4 [142807.938284] Leak r=1 4 [142999.674521] Leak r=1 1 [143177.462073] Leak r=1 4 [143230.001570] Leak r=1 4 [143232.982560] Leak r=1 4 [143234.537096] Leak r=1 4 [143297.927760] Leak r=1 4 [143300.633603] Leak r=1 4 [143302.172917] Leak r=1 4 [143357.083193] Leak r=1 1 [143361.780879] Leak r=1 4 [143413.706490] Leak r=1 4 [143552.996598] Leak r=1 1 [EMAIL PROTECTED] /proc/sys/net/ipv4]# grep . * [...snip...] tcp_frto:1 I suspect this is the main ingrediment to trigger these leaks, well, I'm pretty sure of... Sami, please test the patch included below, Dave can then put that one to net-2.6 and to stable too. This is sort of poking to dark still... But it's pretty much the only place where the SACKED_RETRANS bit is touched without checking first that the adjustment can safely be made (and all SACKED_RETRANS changes in 2.6.22 are FRTO related as well). Most likely something cleared SACKED_RETRANS bit underneath FRTO and in tcp_enter_frto_loss I just blindly assumed that it's still there. 
While the patch below probably works, and leaks are no more, I'd like to get to the bottom of this by really figuring out what caused the SACKED_RETRANS bit to get cleared in the first place (wasn't expecting this to happen while I wrote the FRTO). I guess that it could be lost retransmit loop in sacktag but again I've no concrete proof for that yet. Because for that to trigger, something must have allowed sending skbs past the snd_nxt at the time of the RTO, which too must be prevented during FRTO! Thus there could be other issues while this is just a symptom of the main problem. I'd be interested to study some tcpdumps that relate to Leak cases you're seeing. Could you record some Sami? I'm not sure though how one can figure out the timestamp relation between the kernel log and a tcpdump log... Anyway, for this debugging, you should use a debug version of this patch with WARN_ON to get exact timestamp of the event since the leak print may occur much later on, I put one available at http://www.cs.helsinki.fi/u/ijjarvin/patches/ . Other candidates for the cause are even less likely. The first two are self-standing so that this patch is going to be necessary as long as fuzzy SACK blocks are allowed to be received and processed in sacktag (regardless there turns to be additional problem triggering this one or not): - DSACK touching snd_una (receiver is pretty inconsistent with itself because snd_una wasn't advanced). - 2xRTO and SACK @ snd_una (same note as above) - snd_una advanced full skb _without_ FLAG_DATA_ACKED being set (unlikely) - Head not retransmitted on RTO when FRTO was enabled (no, it's not going to be this one) It cannot be double SACKED_RETRANS because another msg would be printed already in tcp_retransmit_skb. Anyway, I'll be offline over the weekend starting from the Friday morning. ...thanks for your report Sami. -- i. 
[PATCH] [TCP]: Verify the presence of RETRANS bit when leaving FRTO For yet unknown reason, something cleared SACKED_RETRANS bit underneath FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69f9f1e..4e5884a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1398,7 +1398,9 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) * waiting for the first ACK and did not get it)... */ if ((tp-frto_counter == 1) !(flagFLAG_DATA_ACKED)) { - tp-retrans_out += tcp_skb_pcount(skb); + /* For some reason this R-bit might get cleared? */ + if (TCP_SKB_CB(skb)-sacked TCPCB_SACKED_RETRANS) + tp-retrans_out += tcp_skb_pcount(skb); /* ...enter this if branch just for the first segment */ flag
Re: Linux 2.6.22: Leak r=1 1
On Thu, 12 Jul 2007, Sami Farin wrote: On Thu, Jul 12, 2007 at 10:53:57 +0300, Ilpo Järvinen wrote: On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Took a while to grep what is leaking. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux Just normal Internet usage, azureus for example =) I think this is easy to trigger. I guess those packet loss periods help you to reproduce it so easily. ... I'd be interested to study some tcpdumps that relate to Leak cases you're seeing. Could you record some Sami? I'm not sure though how one can figure I now have 300 MB capture and several newretarded music videos... And 10 WARNINGs and 0 Leak printk's. Thanks, every warning would have lead to a Leak print later on (not necessarily with 1-to-1 relation but every pending warning would be acknowledged by a single Leak print). So every time the WARNING triggers, inconsistency would have been result without the patch. 2007-07-12 12:03:18.910712500 4[ 1318.606826] WARNING: at net/ipv4/tcp_input.c:1402 tcp_enter_frto_loss() ...snip... This is MAYBE the guilty connection if timestamps are to be believed: ...snip... I think you got the correct connection... Thanks. The problem seems to be related to FIN (a case that wouldn't have occurred to me without your log, thanks again :-))... I think that the patch I suggested should be fine (and it fixes the fuzzy sack block issues as well) though I still have problem in figuring out what's the exact path of execution on each ACK near the end of the connection (the sent packets are misplaced in the shown dump but the original order can be reconstructed from IP identifiers and TCP timestamps). Well, haven't gotten Leaks anymore after applying the patch. I'd have been a bit surprised if they would have still been there with the patch... -- i.
Re: [PATCH 2.6.22] TCP: Make TCP_RTO_MAX a variable (take 2)
On Thu, 12 Jul 2007, Rick Jones wrote: One question is why the RTO gets so large that it limits failover? If Linux TCP is working correctly, RTO should be srtt + 2*rttvar So either there is a huge srtt or variance, or something is going wrong with RTT estimation. Given some reasonable maximums of Srtt = 500ms and rttvar = 250ms, that would cause RTO to be 1 second. I suspect that what is happening here is that a link goes down in a trunk somewhere for some number of seconds, resulting in a given TCP segment being retransmitted several times, with the doubling of the RTO each time. But that's a back-off for the retransmissions, the doubling is temporary... Once you return to normal conditions, the accumulated backoff multiplier will be immediately cut back to normal. So you should then be back to 1 second (like in the example or whatever) again... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [NET]: Another unnecessary net/tcp.h inclusion in net/dn.h
No longer needed. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/dn.h |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/include/net/dn.h b/include/net/dn.h index ac4ce90..6277783 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -3,7 +3,6 @@ #include linux/dn.h #include net/sock.h -#include net/tcp.h #include asm/byteorder.h #define dn_ntohs(x) le16_to_cpu(x) -- 1.5.0.6
[PATCH] [BNX2]: Seems to not need net/tcp.h
Got bored to always recompile it for no reason. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- drivers/net/bnx2.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 88b33c6..56244f6 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -40,7 +40,6 @@ #define BCM_VLAN 1 #endif #include net/ip.h -#include net/tcp.h #include net/checksum.h #include linux/workqueue.h #include linux/crc32.h -- 1.5.0.6
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Mon, 2 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 16 Jun 2007 02:04:25 +0300 (EEST) There are still some things I must think carefully in sacktag processing since it does not validate start_seq and end_seq at all which can be abused currently at least in tcp-2.6. ...I would rather put end to the whole russian roulette in tcp-2.6 sacktag rather than fix/think individual cases and leave future modifications of it similarily hazardous. It's not very clear to me how to handle all possible cases of invalid SACK blocks though, perhaps TCP should just ignore such sack blocks without doing any processing based on them, i.e., ignore them whenever start_seq-end_seq does not fully fit to snd_una-snd_nxt (expect DSACK of course, which should be processed if it's between undo_marker-snd_nxt). Do you have any thoughts about this? I agree. This is a problem that basically every single TCP stack out there right now is vulnerable to, lots of cpu processing for invalid or maliciously created SACK blocks. This is why I even considered the RB-Tree stuff at all. Yes, without RB-tree TCP is quite much out of luck. However, as long as there is depency to skb's fack_count in sacktag, there seems to be some room for cpu processing attacks even with RB-tree case because walk becomes necessary on the slow path. I've thought about three solutions to this, each has some pros cons so I'm not that sure which path should be pursued: 1. fack_count in skb (your proposal months ago) + Trivial arithmetics, no walking necessary to find it (ever) - Very expensive in storage wise (like you have stated earlier) 2. fackets_out removal like I presented earlier (which you seem to have looked by now). It was really RFC and was somewhat in state of flux still + Fast path is fully arithmetics based + Saves little (extra) space in tcp_sock too + Its effect for one directional flow without nagle is quite insignificant I would say (in friendly environment). 
- But alas, I've noticed that any one directional flow could be forced to bidirectional slow path by sending some dummy data with gaps (in malicious intend), then the sender would use SACKs to inform about those losses whenever data is being send, and if window is large enough TCP would be destined to slow path... :-( How easy it is, i.e., with how small window the sent SACKs exceed MSS size which is the fastpath/slowpath selector depends largely on how large fraction the SACK blocks are in bytes of MSS, with 9000 MTU it's perhaps no longer so easy to do is large scale... Thus it's not anymore sender controllable by nagle setting, like I thought earlier, and therefore it's not that attractive anymore. * malicious tcp_fragment forcing would have to be also closed before this is of any use. It seems to be easy to do by adding WAS_COLLAPSED bit to skb-sacked (has some unused bits) and ignoring middle skb SACKs too. IIRC you were (almost fully) for this already in the past? Haven't checked but it should be quite easy to do, and it is something I'll probably do anyway... 3. seqno based tp-reordering + Trivial to calculate now that there is highest_sack - No longer would the pkt counting be fully pkts based. Not that bad in the case of no Nagle though (which is fully under sender's control :-)). Equal behavior can be estimated by doing rounding to MSS size though (requires some divides though), for no-nagle case it could do exactly the same if I have thought it correctly. - I'd expect some small skb user will be complaining due to performance loss (recovery latency increasing), perhaps we could add sysctl for enabling expensive processing for them. Or just try without it and see what happens, nobody would notice I guess though it will probably surface later on when somebody is first time looking numbers (and is not pleased)... ...And would then of course claim that 2.6.3x (or something later) has a regression against 2.6.(3x-1)... 
:-) I think that the expensive processing stuff can reuse some work I've already done for the alternative 2 earlier to avoid walk-always in case of a flow with normal usage pattern on the same host. ...I'd say that alternative 3 is most attractive of these... But due to small skb recovery affecting change I'm not too sure if it's desired? With RB-tree things get really interesting since it seems that TCP has some chance... :-) There are many things still to handle but it's getting closer already after SACK validation is in tcp-2.6. One thing I would be happy avoid is the full walk that is necessary when SACK reneging is detected... It is already much harder to do in tcp-2.6 and at least the amount of work decreases compared to the last reneging (if it was in the same
Re: [PATCH 8/9] [TCP]: Reduce sacked_out with reno when purging write_queue
On Mon, 2 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:36:01 +0300 Previously TCP had a transitional state during which reno counted segments that are already below the current window into sacked_out, which is now prevented. Re-try now unconditional S+L catching (I wonder if we could get that BUG_ON place pinpointed more exactly in oops because now inlining makes it lose its original context as they always seem to be in tcp_ack, #define helps?). Beware, this change is not a trivial one and might have some unexpected side-effects under obscure conditions since state tracking is to happen much later on and the reno sack counting was highly depending on the current state. This approach conservatively calls just remove_sack and leaves reset_sack() calls alone. The best solution to the whole problem would be to first calculate the new sacked_out fully (this patch does not move reno_sack_reset calls from original sites and thus does not implement this). However, that would require very invasive change to fastretrans_alert (perhaps even slicing it to two halves). Alternatively, all callers of tcp_packets_in_flight (i.e., users that depend on sacked_out) should be postponed until the new sacked_out has been calculated but it isn't any simpler alternative. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] So basically the idea behind this patch is to do the update of the fake RENO sakcs in clean_rtx_queue instead of fixing it up at the very last moment right before we invoke tcp_try_to_undo_partial(). Yeap, it would change sacked_out that things are seeing before undo_partial point (if there would be some) but it would qualify IMHO as fix for those case too rather than bug. I checked that F-RTO (with reno) cannot ever access stale sacked_out because of other constraints based on ACK's snd_una, and that's pretty much what's being done between there. 
TCP might still do another update later on by using tcp_reset_reno_sack, but that's always going to be a more aggressive update (or at least an equal one). I like this patch and I can't find any holes in the idea. But some things have changed in the meantime and this patch (and probably 9/9 too) don't apply cleanly. Could you respin these against current tcp-2.6 so I can apply them? I knew that :-), it was postponed due to other, more important issues and because the trees then got some dependencies at that point I decided it's better to wait for a while until tcp-2.6 gets all stuff that's to be in net-2.6... But now that things seem to have settled, I'll provide the updated one later on. The 9/9 should be dropped, I've noticed a cpu processing vulnerability in it which I previously didn't see there (I describe it in my other post). -- i.
Re: [RFC PATCH tcp-2.6 1/2] [TCP]: Discard fuzzy SACK blocks
On Mon, 18 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Tue, 19 Jun 2007 01:25:57 +0300 From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been sort of russian roulette as no validation of SACK blocks is previously attempted. It is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). This fixes also one remote triggerable NULL-ptr access: start_seq was trusted as a valid mark_lost_entry_seq but without validation it is relatively easy to inject an invalid sequence number there that will cause a crash in the lost marker code. Other SACK processing code seems safe so this could have been fixed locally but that would leave SACK processing hazardous in case of future modifications. IMHO, it's just better to close the whole roulette rather than disarming just one bullet. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] This looks good applied. Does mainline 2.6.x has this NULL-ptr issues too? If so we'll have to fix it there very soon. The null-ptr issue is related to the new lost marker code which isn't in the mainline (I have even tried to state this couple of times in the earlier posts so that you could have less worries as I understand that it's your highest priority concern :-)). I.e, the mainline lost marker code scans queue by using tcp_for_write_queue() or starts from the hint it has learned earlier, it doesn't have to trust any sequences that are given by the peer in SACK block. I'll try to explain this once again: Because the new lost marker fastpath blindly trusted start_seq from received SACK, it could be abused to search for a sequence number for which tcp_write_queue_find returns NULL (happens when the searched sequence is outside of current window). 
Another trouble would nowadays occur if start_seq == snd_una because write_queue_find is now given start_seq - 1 (which is fully intentional, no need to find skb at start_seq but one below it). I disallowed SACK blocks starting from snd_una partly due to that, and partly due to it's non-sense interpretation. It would not end up to the lost marker though because sack reneging detection intervenes. Obviously I would have alerted you and security folks if there would have been this problem in mainline stable too (and probably had done a valid patch to those trees at the first time). In case you still want to verify mainline by yourself, you can see where start_seq and end_seq goes in mainline's sacktag (not very hard to see :-)). Like I've explained earlier, only serious suspect seems to be the calculation of pkt_len that is input to tcp_fragment. Other cases are relatively trivial. However, analytical analysis of pkt_len setting took me considerable amount of time due to vast number of negations which were able to defeat my brains at least partly. Therefore I really wouldn't mind if also you verify that pkt_len never becomes either zero or exceeds skb-len (even equal to skb-len is not very nice). Things I tried to consider are (you might find some additional thing to consider besides these): - outside snd_una-snd_nxt start/end_seq - start/end_seq equal - start/end_seq reversing - 2^31 wrap problems - after/before ambiguity (discussed some time ago on netdev) I couldn't find a problem causing case because the successive before() and after() jails were so tight. Neither did my limited bruteforcer succeed (as you can see, I wasn't that convinced about by my analysis validity)... For sure you're curious enough - have a nice day with the negations... ;-) ...Seriously, double verification of that pkt_len part wouldn't hurt considering it's complexity. -- i.
[RFC PATCH tcp-2.6 1/2] [TCP]: Discard fuzzy SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been sort of russian roulette as no validation of SACK blocks is previously attempted. It is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). This fixes also one remote triggerable NULL-ptr access: start_seq was trusted as a valid mark_lost_entry_seq but without validation it is relatively easy to inject an invalid sequence number there that will cause a crash in the lost marker code. Other SACK processing code seems safe so this could have been fixed locally but that would leave SACK processing hazardous in case of future modifications. IMHO, it's just better to close the whole roulette rather than disarming just one bullet. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 40 1 files changed, 40 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4c8882e..ebd9739 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -977,6 +977,27 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * + * SACK block validation. + * -- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. But + * there all simplicity ends, TCP might receive valid D-SACKs below that. 
+ * As long as they reside fully below undo_marker they do not affect + * behavior in anyway and can therefore be safely ignored. In rare cases + * (which are really theoretical ones), the D-SACK will nicely cross that + * boundary due to skb fragmentation and packet reordering past skb's + * retransmission. To consider them correctly, the acceptable range must + * be extended even more though the exact amount is rather hard to + * quantify (from the state TCP has?), so in boundary crossing case + * simply adjust start_seq to undo_marker. + * * Reordering detection. * * Reordering metric is maximal distance, which a packet can be displaced @@ -1243,9 +1264,28 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, struct sk_buff *skb; __u32 start_seq = ntohl(sp-start_seq); __u32 end_seq = ntohl(sp-end_seq); + u32 valid_range_start; int fack_count; state.dup_sack = (found_dup_sack (i == state.first_sack_index)); + /* SACK validation: start_seq == snd_una is non-sensical */ + valid_range_start = tp-snd_una + 1; + + if (state.dup_sack tp-undo_marker) { + valid_range_start = tp-undo_marker; + + /* Undo_marker boundary crossing, see comments above */ + if (before(start_seq, tp-undo_marker) + after(end_seq, tp-undo_marker)) + start_seq = tp-undo_marker; + } + + /* Discard SACK blocks that are seemingly bad */ + if (before(start_seq, valid_range_start) || + after(end_seq, tp-snd_nxt) || + !before(start_seq, end_seq)) + continue; + skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 0/2]: SACK block validation
Hi, SACK block validation to tcp-2.6 tree. Before preparing this, I rebased tcp-2.6 to net-2.6 (not 2.6.23) because of DSACK patch in net-2.6 (some conflicts will occur). I did some very basic testing with this. No TCPSACKDiscards occurred in it but it's kind of hard for me to fully test the whole internet without being a server end for most of it, maybe I should start abusing all port probers for that purpose by using them as sinks... :-) This patchset addresses the only issue I found while trying to address your earlier comments (found in here): http://marc.info/?l=linux-netdevm=118060110632768w=2 It's perfectly ok for timedout_continue to be NULL and it's being checked for except when the result of find for tp-highest_sack is accessed (that will be a valid skb because sacked_out 0). Actually I didn't fully understand what was the concern with timedout_continue rather than with skb... But this problem was located anyway, so the comment itself proved useful (and DSACK regression got fixed too due to it :-)). If it looks sane, you can consider applying to tcp-2.6 (rebasing is needed for that). I'm not entirely sure if I can add MIB stuff just like that (couldn't find any examples from the available history) though some info will definitely be useful when trying to figure out regression reports (due to flawed TCP implementation of the peer). Yes, it might cause some troubles when communicating with broken TCP implementations. Do you have some plans regarding tcp-2.6? FYI, I've also discovered that the current implementation of timedout_mark_forward is flawed because tcp_skb_timedout() boundary is advanced whenever retransmissions there occur but I probably don't have time to address that until mid-July or so. I think that the hint is not coming back though as TCP can use a binary search that finds TCBCB_TAGBITS boundary using the RB-tree. 
My other plans include, e.g., combining highest_sack(+fackets_out) and sack fastpath hints but that requires some benchmarking first to see how frequently they are different, I suspect it won't really happen that often, so most of the time we have three u32s and one skb ptr pointing the very same place (fackets stuff is not really pointing but related like you said earlier)... :-) -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH tcp-2.6 2/2] [TCP] MIB: Add counter for discarded SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h |1 + net/ipv4/proc.c |1 + net/ipv4/tcp_input.c |4 +++- 3 files changed, 5 insertions(+), 1 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 802b3a3..6e64a70 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,7 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf..d5a8de0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPAbortOnLinger, LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM(TCPAbortFailed, LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM(TCPMemoryPressures, LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM(TCPSACKDiscard, LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ebd9739..e26d8ca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1283,8 +1283,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* Discard SACK blocks that are seemingly bad */ if (before(start_seq, valid_range_start) || after(end_seq, tp-snd_nxt) || - !before(start_seq, end_seq)) + !before(start_seq, end_seq)) { + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Fri, 15 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 16 Jun 2007 02:04:25 +0300 (EEST) On Fri, 15 Jun 2007, David Miller wrote: Patch applied. ...I think it should go to stable as well. I thought Baruch's DSACK separation that created this problem didn't go in until 2.6.22? Then my gitk lies... :-) It claims for the particular commit: Follows: v2.6.20 Preceeds: v2.6.21-rc1 $ git-describe 6f74651ae626ec672028587bc700538076dfbefb v2.6.20-835-g6f74651 ...It's definitely in 2.6.21 already. -- i.
[PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 12 include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..2b139f6 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -261,6 +261,18 @@ static inline s64 ktime_to_ns(const ktime_t kt) #endif +/** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + static inline s64 ktime_to_us(const ktime_t kt) { struct timeval tv = ktime_to_timeval(kt); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..6f0b2f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1579,6 +1579,10 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline ktime_t net_invalid_timestamp(void) +{ + return ktime_set(0, 0); +} extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 4adc47c..b2b2256 100644 --- a/net/ipv4/tcp_illinois.c +++ 
b/net/ipv4/tcp_illinois.c @@ -90,6 +90,9 @@ static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) ca-acked = pkts_acked; + if (ktime_equal(last, net_invalid_timestamp())) + return; + rtt = ktime_to_us(net_timedelta(last)); /* ignore bogus values, this prevents wraparound in alpha math */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6d0f9b..aaf6f66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2409,7 +2409,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - ktime_t last_ackt = ktime_set(0,0); + ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { @@ -2487,6 +2487,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + /* Is the ACK triggering packet unambiguous? */ + if (acked FLAG_RETRANS_DATA_ACKED) + last_ackt = net_invalid_timestamp(); + if (ca_ops-pkts_acked) ca_ops-pkts_acked(sk, pkts_acked, last_ackt); } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 43294ad..e49836c 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -266,7 +266,8 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (!ktime_equal(last, net_invalid_timestamp())) + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); /* calc inference */ if (tcp_time_stamp tp-rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 73e19cf..e218a51 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -117,6 +117,9 @@ void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; + if (ktime_equal(last, net_invalid_timestamp())) + return; + /* Never allow zero 
rtt or baseRTT */ vrtt = ktime_to_us(net_timedelta(last)) + 1; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 9edb340..ec854cc 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -74,6 +74,9 @@ static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) struct veno *veno = inet_csk_ca(sk); u32 vrtt; + if (ktime_equal(last, net_invalid_timestamp())) + return; + /* Never allow zero rtt or baseRTT */ vrtt = ktime_to_us
[PATCH tcp-2.6 0/3]: cleanups
Couple of relatively simple cleanups to tcp-2.6. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 1/3] [TCP]: Add tcp_dec_pcount_approx int variant
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h | 11 --- net/ipv4/tcp_output.c |8 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6f88d13..d12154a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -593,16 +593,21 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) return skb_shinfo(skb)-gso_size; } -static inline void tcp_dec_pcount_approx(__u32 *count, -const struct sk_buff *skb) +static inline void tcp_dec_pcount_approx_int(__u32 *count, const int decr) { if (*count) { - *count -= tcp_skb_pcount(skb); + *count -= decr; if ((int)*count 0) *count = 0; } } +static inline void tcp_dec_pcount_approx(__u32 *count, +const struct sk_buff *skb) +{ + tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); +} + static inline void tcp_packets_out_inc(struct sock *sk, const struct sk_buff *skb) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 23ee283..c8be1c8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -738,15 +738,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (diff 0) { /* Adjust Reno SACK estimate. */ if (!tp-rx_opt.sack_ok) { - tp-sacked_out -= diff; - if ((int)tp-sacked_out 0) - tp-sacked_out = 0; + tcp_dec_pcount_approx_int(tp-sacked_out, diff); tcp_sync_left_out(tp); } - tp-fackets_out -= diff; - if ((int)tp-fackets_out 0) - tp-fackets_out = 0; + tcp_dec_pcount_approx_int(tp-fackets_out, diff); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 2/3] [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] There is no need for such check in pkts_acked because the callback is not invoked unless at least one segment got fully ACKed (i.e., the snd_una moved past skb's end_seq) by the cumulative ACK's snd_una advancement. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_bic.c |2 +- net/ipv4/tcp_cubic.c|2 +- net/ipv4/tcp_westwood.c |3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 281c9f9..9ebd8ec 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -210,7 +210,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { + if (icsk-icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1422448..8be8ac4 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -338,7 +338,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { + if (icsk-icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index e61e09d..0d56a5a 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -103,8 +103,7 @@ static void westwood_filter(struct westwood *w, u32 delta) static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct westwood *w = inet_csk_ca(sk); - if (cnt 0) - w-rtt = tcp_sk(sk)-srtt 3; + w-rtt = tcp_sk(sk)-srtt 3; } /* -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 3/3] [TCP]: Move sack_ok access to obviously named funcs & cleanup
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Previously code had IsReno/IsFack defined as macros that were local to tcp_input.c though sack_ok field has users elsewhere too for the same purpose. This changes them to static inlines as preferred according the current coding style and unifies the access to sack_ok across multiple files. Magic bitops of sack_ok for FACK and DSACK are also abstracted to functions with appropriate names. Note: - One sack_ok = 1 remains but that's self explanary, i.e., it enables sack - Couple of !IsReno cases are changed to tcp_is_sack - There were no users for IsDSack = I dropped it Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h| 28 net/ipv4/tcp.c |2 +- net/ipv4/tcp_input.c | 103 +- net/ipv4/tcp_minisocks.c |2 +- net/ipv4/tcp_output.c|6 +- net/ipv4/tcp_timer.c |2 +- 6 files changed, 90 insertions(+), 53 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d12154a..dd9749e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -715,6 +715,34 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk-icsk_ca_ops-cwnd_event(sk, event); } +/* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. + * + * tcp_is_sack - SACK enabled + * tcp_is_reno - No SACK + * tcp_is_fack - FACK enabled, implies SACK enabled + */ +static inline int tcp_is_sack(const struct tcp_sock *tp) +{ + return tp-rx_opt.sack_ok; +} + +static inline int tcp_is_reno(const struct tcp_sock *tp) +{ + return !tcp_is_sack(tp); +} + +static inline int tcp_is_fack(const struct tcp_sock *tp) +{ + return tp-rx_opt.sack_ok 2; +} + +static inline void tcp_enable_fack(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok |= 2; +} + /* This determines how many packets are in the network to the best * of our knowledge. 
In many cases it is conservative, but where * detailed information is available from the receiver (via SACK diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd4c295..e3de919 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2000,7 +2000,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp-rx_opt.tstamp_ok) info-tcpi_options |= TCPI_OPT_TIMESTAMPS; - if (tp-rx_opt.sack_ok) + if (tcp_is_sack(tp)) info-tcpi_options |= TCPI_OPT_SACK; if (tp-rx_opt.wscale_ok) { info-tcpi_options |= TCPI_OPT_WSCALE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3942c3..3efd4f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -108,11 +108,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -#define IsReno(tp) ((tp)-rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)-rx_opt.sack_ok 2) -#define IsDSack(tp) ((tp)-rx_opt.sack_ok 4) -#define Is3517Sack(tp) (!IsReno(tp) !IsFack(tp)) - #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -840,6 +835,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok = ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. 
*/ static void tcp_init_metrics(struct sock *sk) @@ -861,7 +871,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) tp-reordering != dst_metric(dst, RTAX_REORDERING)) { - tp-rx_opt.sack_ok = ~2; + tcp_disable_fack(tp); tp-reordering = dst_metric(dst, RTAX_REORDERING); } @@ -923,9 +933,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -937,8 +947,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp-sacked_out
[PATCH] [TCP]: Fix logic breakage due to DSACK separation
Commit 6f74651ae626ec672028587bc700538076dfbefb is found guilty of breaking DSACK counting, which should be done only for the SACK block reported by the DSACK instead of every SACK block that is received along with DSACK information. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |9 + 1 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6d0f9b..c6a363b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -953,7 +953,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; - int dup_sack = 0; + int found_dup_sack = 0; int cached_fack_count; int i; int first_sack_index; @@ -964,20 +964,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* Check for D-SACK. */ if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)-ack_seq)) { - dup_sack = 1; + found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1 !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { - dup_sack = 1; + found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } /* D-SACK for already forgotten data... * Do dumb counting. */ - if (dup_sack + if (found_dup_sack !after(ntohl(sp[0].end_seq), prior_snd_una) after(ntohl(sp[0].end_seq), tp-undo_marker)) tp-undo_retrans--; @@ -1058,6 +1058,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ __u32 start_seq = ntohl(sp-start_seq); __u32 end_seq = ntohl(sp-end_seq); int fack_count; + int dup_sack = (found_dup_sack (i == first_sack_index)); skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Fri, 15 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Fri, 15 Jun 2007 16:07:37 +0300 (EEST) Commit 6f74651ae626ec672028587bc700538076dfbefb is found guilty of breaking DSACK counting, which should be done only for the SACK block reported by the DSACK instead of every SACK block that is received along with DSACK information. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Thanks for performing such rigorous audits and finding regressions like this! They just come across, one thing leads to another, which again leads to another and so on... ...I was just trying to address the concerns you noted about tcp-2.6 patchset a while ago and came across other issues... There are still some things I must think carefully in sacktag processing since it does not validate start_seq and end_seq at all which can be abused currently at least in tcp-2.6. ...I would rather put end to the whole russian roulette in tcp-2.6 sacktag rather than fix/think individual cases and leave future modifications of it similarily hazardous. It's not very clear to me how to handle all possible cases of invalid SACK blocks though, perhaps TCP should just ignore such sack blocks without doing any processing based on them, i.e., ignore them whenever start_seq-end_seq does not fully fit to snd_una-snd_nxt (expect DSACK of course, which should be processed if it's between undo_marker-snd_nxt). Do you have any thoughts about this? It seems to me that net-2.6 is safe in this respect (probably just by accident) but the analysis wasn't that trivial, my main concern was tcp_fragment's pkt_len argument, if it ever becomes skb-len, a boom would result (and perhaps zero has similar issues)! I couldn't find breakage analytically (but I could be wrong in it due to mind-boggling number of negations :-)) nor by bruteforcing seqno combinations near 0, skb-len and 2^31 wrap. Patch applied. ...I think it should go to stable as well. -- i.
[PATCH] [TCP]: Add missing break to TCP option parsing code
This flaw does not affect any behavior (currently). Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d506bdc..aaf6f66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2936,6 +2936,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, opt_rx-sack_ok) { TCP_SKB_CB(skb)-sacked = (ptr - 2) - (unsigned char *)th; } + break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* -- 1.5.0.6
[PATCH net-2.6 1/1] [TCP]: Fix left_out setting during FRTO
Without FRTO, the tcp_try_to_open is never called with lost_out 0 (see tcp_time_to_recover). However, when FRTO is enabled, the !tp-lost condition is not used until end of FRTO because that way TCP avoids premature entry to fast recovery during FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- This case was found during left_out drop audit (only relevant to net-2.6 since tcp-2.6 does a right thing after left_out drop). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 74683d8..ed4a1bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2037,7 +2037,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp-left_out = tp-sacked_out; + tcp_sync_left_out(tp); if (tp-retrans_out == 0) tp-retrans_stamp = 0; -- 1.5.0.6
[RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
I was thinking something like this to fix the cc module breakage introduced by the API change (haven't tested it besides compile): [RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 18 ++ include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..9f7fa3e 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -102,6 +102,12 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) #define ktime_add_ns(kt, nsval) \ ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) +/* Compare two ktime_t variables, returns 1 if equal */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + /* convert a timespec to ktime_t format: */ static inline ktime_t timespec_to_ktime(struct timespec ts) { @@ -200,6 +206,18 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); /** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return !((cmp1.tv.sec ^ 
cmp2.tv.sec) | (cmp1.tv.usec ^ cmp2.tv.usec)); +} + +/** * timespec_to_ktime - convert a timespec to ktime_t format * @ts:the timespec variable to convert * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..6f0b2f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1579,6 +1579,10 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline ktime_t net_invalid_timestamp(void) +{ + return ktime_set(0, 0); +} extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 4adc47c..5f8d01b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -90,6 +90,9 @@ static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) ca-acked = pkts_acked; + if (ktime_equal(last, net_invalid_timestamp()) + return; + rtt = ktime_to_us(net_timedelta(last)); /* ignore bogus values, this prevents wraparound in alpha math */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ed4a1bd..d506bdc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2409,7 +2409,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - ktime_t last_ackt = ktime_set(0,0); + ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { @@ -2487,6 +2487,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + /* Is the ACK triggering packet unambiguous? 
*/ + if (acked FLAG_RETRANS_DATA_ACKED) + last_ackt = net_invalid_timestamp(); + if (ca_ops-pkts_acked) ca_ops-pkts_acked(sk, pkts_acked, last_ackt); } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 43294ad..efa358b 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -266,7 +266,8 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (!ktime_equal(last, net_invalid_timestamp()) + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); /* calc inference */ if (tcp_time_stamp tp-rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 73e19cf..bd7a08f 100644 --- a/net/ipv4
Re: Suspicious fackets_out handling
On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 23 Apr 2007 14:28:21 +0300 (EEST) There are IMHO two problems in it. First of all, nothing ensures that the skb TCP is fragmenting is actually below the forwardmost sack block (and thus is included to the fackets_out)... ...this is so old thing already that I had to refresh my memory, haven't been expecting any answer for ages... :-) Good catch, I agree with your analysis completely. ...I later on understood (from a comment I found elsewhere) that fackets_out is occasionally estimated, rather than exact value (which could be completely fixed in tcp-2.6 due to presence of highest_sack but I'm also considering complete removal of it too, like you probably noticed). What I'm not sure of though, is how to fix this in net-2.6(.22), it is due to the fact that there is no pointer/seq which can be used in testing for it like in tcp-2.6 which has the highest_sack. We can add highest_sack to 2.6.22 in order to fix a bug like this, if necessary. ...I think we can leave it as estimate for now, it has been like that for years and nobody has complained... :-) This problem covers really marginal number of cases anyway I think (isn't the diff usually negative: old - new after fragment should be negative unless mss_now bites). After I found the comments and analyzed some sacked_out code with NewReno, I think that usually these estimates are indicated in the code with tcp_dec_pcount_approx(...) but it wasn't used in this specific case because it inputs skb instead of int, I'll add clarification of this into my tcp-2.6 todo list... Second problem is even more obvious: if adjustment here is being done and the sacktag code then uses fastpath at the arrival of the next ACK, the sacktag code will use a stale value from fastpath_cnt_hint and fails to notice all that math TCP did here unless we start clearing fastpath_skb_hint. 
Let's try not to clear fastpath_skb_hint here if possible :-) Np, however, whatever we do, it wouldn't really execute that often... :-) Is it possible to update fastpath_cnt_hint properly perhaps? I think that would be valid and even accurate as it can checks skb's seq against fastpath_skb_hint-seq. I'm in a hurry and will be a week out of reach of internet connectivity but here's quick attempt to deal with this cleanly. Compile tested (be extra careful with this one, it's done i a hurry :-)), consider to net-2.6. Considering the marginality of this issue, stable might really be an overkill for this one, only a remotely valid concern comes to my mind in this case: possibility of fackets_out packet_out might not be dealt that cleanly everywhere (but I'm sure that you can come to a good decicion about it): [PATCH] [TCP]: SACK fastpath did override adjusted fackets_out Do same adjustment to SACK fastpath counters provided that they're valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 53232dd..f24dd36 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -699,6 +699,15 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss tp-fackets_out -= diff; if ((int)tp-fackets_out 0) tp-fackets_out = 0; + /* SACK fastpath might overwrite it unless dealt with */ + if (tp-fastpath_skb_hint != NULL + after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, + TCP_SKB_CB(skb)-seq)) { + tp-fastpath_cnt_hint -= diff; + if ((int)tp-fastpath_cnt_hint 0) + tp-fastpath_cnt_hint = 0; + + } } } -- 1.5.0.6
Re: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules)
On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Wed, 30 May 2007 12:10:06 +0300 (EEST) [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. There is no need to handle SYN case specially because congestion control modules are not yet initialized when FLAG_SYN_ACKED is set. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Patch applied, thanks a lot! ...It's good to be on the alert with this one because it might uncover some hidden bugs from congestion control module code (it changes some behavior so radically), which obviously will have to be then fixed too... -- i.
Re: [PATCH 3/9] [TCP]: Tighten tcp_sock's belt, drop left_out
Thanks for you comments... On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:35:56 +0300 It is easily calculable when needed and user are not that many after all. This looks good, but are you absolutely sure we never used a stale value of tp-left_out on purpose? I tried to audit this but there are so many cases :-) Yes I know... So often complexity in these kind of things really perplexes me too, in a way this is somewhat similar to the case with FRTO and no SACKed bits touched (which just looked very obvious and safe) but would have caused the dreaded S+L situation unless special handling for IsReno(tp) case would have been added. What I mean here is, was there a case where we updated sacked_out or lost_out, but then used left_out before it was updated, and we were doing this on purpose? I understand what you mean even though I think that such approach would be very fragile if being done (and should be fixed). I was actually wondering earlier that could it be the only reason for the separate variable in the first place. Though so far I don't know any cases... ...but the code ain't always that trivial like you say :-), however, this after proven safe, will make it cleaner and will save next one to come from the same wondering... ;-) Anyway, I'll be try to do some grep tricks in order to reduce the number of cases that need an expensive manual audit to a smaller set where tcp_sync_left_out is not right after the adjustment. As plenty of functions end to tcp_sync_left_out without returning in between, it could help a lot, of course the calls within that function could have issues too but they're relatively easy to check then (common pattern is a trivial loop + sync as you know)... Usually only when the stale state leaks from a function, we're in troubles with the number of possible paths. 
...I'll come back to this later on after verifying it and perhaps provide the same shortened list for you too so that you can see if I overlooked something... It might actually end up being easier to audit all tcp_packets_in_flight callers instead to see if there's always sync_left_out before them. Another thing, I was thinking of renaming tcp_sync_left_out to tcp_verify_left_out now that it does only BUG traps... Do you have a hunch about this (from patch 8 msg): I wonder if we could get that BUG_ON place pinpointed more exactly in oops because now inlining makes it lose its original context as they always seem to be in tcp_ack, #define helps? (i.e, how does it show up in oops if tcp_verify_left_out looks like this): #define tcp_verify_left_out(tp) BUG_ON(...) ...does it still point to the tcp.h line then or to the tcp_sync_left_out(tp) @ line in tcp_input.c? ...I'll test that later on by myself in case you don't know the answer. -- i.
Re: [PATCH-2.4] Fix divide by 0 in vegas_cong_avoid()
On Tue, 29 May 2007, Stephen Hemminger wrote: On Tue, 29 May 2007 20:23:45 +0200 Lior Dotan [EMAIL PROTECTED] wrote: NTP was not running. I'm not sure what do you mean by fixing the -1. The trace shows that vegas_cong_avoid() is called with -1, and the only way it can happen is from tcp_clean_rtx_queue() and the patch should eliminate this. Another way of solving this is by checking vegas_rtt_calc() and see if it gets -1 and handle it there. Another thing that I don't understand is that some places like tcp_ack() declare seq_rtt as signed and some vegas_cong_avoid() declare it as unsigned. Shouldn't it be declared always as signed? On 5/29/07, Stephen Hemminger [EMAIL PROTECTED] wrote: On Tue, 29 May 2007 12:18:19 +0300 Lior Dotan [EMAIL PROTECTED] wrote: Hi, I had a divide by zero on kernel 2.4.33 running with Vegas enabled. I don't think anyone has backported the other vegas fixes from 2.6.21 to 2.4. For 2.6.22, rtt_calc doesn't exist, instead pkts_acked is called. I tried to figure this one out yesterday, and it seems to me that divide by zero should not occur with 2.6.22 (nor with 2.6.21 code), no matter how I try to approach vegas... The following should be added to avoid getting bogus timestamp values from retransmits. ...but this is unreliable timestamps problem is a real one that originates from API merge commit 164891aadf1721fca4dce473bb0e0998181537c6 of yours (see some though about it below). I suppose there are now similar concerns about timestamp validity in other cc modules than vegas too. 
--- a/net/ipv4/tcp_vegas.c2007-05-02 12:26:35.0 -0700 +++ b/net/ipv4/tcp_vegas.c2007-05-29 14:06:26.0 -0700 @@ -117,6 +117,10 @@ void tcp_vegas_pkts_acked(struct sock *s struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; + /* Ignore retransmits */ + if (unlikely(cnt == 0)) + return; + /* Never allow zero rtt or baseRTT */ vrtt = ktime_to_us(net_timedelta(last)) + 1; ...I don't think this works, because cnt won't be zero ever because to make the call some skbs were cleaned from the queue (isn't that what pkts_acked stands for?). There is as if (ackedFLAG_ACKED) guard before making the pkts_acked call to cc modules, thus delta in packets_out will always be greater than 0. Hmm, now that I realize this, I would say that checks for 0 in pkts_acked are entirely bogus (in the current code), hmm, that means I have more things to cleanup in tcp-2.6, at least, bic, cubic and westwood do them... :-). I think the code did a right thing before your api merge, since it called rtt callback only if FLAG_RETRANS_DATA_ACKED was not set (and pkts_acked always), i.e., when TCP cannot know if it was the rexmission that triggered the cumulative ACK or any original transmission, no new RTT measurement should be made. Consider also the fact that, there might be a non rexmitted skb after rexmitted one, which Lior's patch doesn't do right way at all since the FLAG_DATA_ACKED would again get set (no div-by-zero would have occured then but an unreliable timestamp would have been givento vegas in 2.4). The number of packets that got cumulatively acked is never a right metric to measure whether the cumulative ACK originated from rexmitted skbs or not. ...Perhaps the FLAG_RETRANS_DATA_ACKED flag needs to be passed somehow to cc modules? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules)
On Tue, 29 May 2007, Ilpo Järvinen wrote: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. Pkts_acked callback of congestion control modules include SYN as a packet for now on. Based on feedback from Stephen, I changed the commit msg bit clearer, the patch remains the same. [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. There is no need to handle SYN case specially because congestion control modules are not yet initialized when FLAG_SYN_ACKED is set. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..74683d8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,6 +2480,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = 
inet_csk(sk)-icsk_ca_ops; -- 1.5.0.6
Re: [PATCH-2.4] Fix divide by 0 in vegas_cong_avoid()
On Wed, 30 May 2007, Stephen Hemminger wrote: On Wed, 30 May 2007 10:49:54 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: I think the code did a right thing before your api merge, since it called rtt callback only if FLAG_RETRANS_DATA_ACKED was not set (and pkts_acked always), i.e., when TCP cannot know if it was the rexmission that triggered the cumulative ACK or any original transmission, no new RTT measurement should be made. Consider also the fact that, there might be a non rexmitted skb after rexmitted one, which Lior's patch doesn't do right way at all since the FLAG_DATA_ACKED would again get set (no div-by-zero would have occured then but an unreliable timestamp would have been givento vegas in 2.4). The number of packets that got cumulatively acked is never a right metric to measure whether the cumulative ACK originated from rexmitted skbs or not. ...Perhaps the FLAG_RETRANS_DATA_ACKED flag needs to be passed somehow to cc modules? What about the mixed case where some retransmitted data and new data was covered by one ack. Without timestamps TCP cannot do anything better than consider a cumulative ACK that includes any ever retransmitted segment as inaccurate timestamp. Like in tcp_ack_no_tstamp: if (flag FLAG_RETRANS_DATA_ACKED) return; ...I think this same applies to cc modules as well. With timestamps things are another beast. I would rather keep the merge, but think about the cases more carefully. IMHO there's no problem with keeping the merge, you just need to add another parameter to pkts_acked (or by some other means indicate it) to distinguish RTT measurement with accurate timestamp from inaccurate ones. Maybe the first step (to 2.6.22 to avoid new bugs etc.) should be just to give the flag (for some funny reason, it's called acked in tcp_clean_rtx_queue) or use a single arg with 0/1 value fo depending on FLAG_RETRANS_DATA_ACKED and check that in the merged pkts_acked in the cases that were using the old rtt callback. 
It would allow retaining the previous (correct) functionality as it was before the API merge. That would prevent potential regressions in cc modules due to this merge. ...Then perhaps come up with a solution that takes advantage of timestamps more aggressively to 2.6.23 like the core rtt estimator already does. -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Tue, 29 May 2007, Stephen Hemminger wrote: On Mon, 28 May 2007 13:27:03 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: On Sun, 27 May 2007, Ilpo Järvinen wrote: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) Yes, thanks for fixing this. Wonder how it affects measurements. ...It's a bit hard to tell since dynamics change so dramatically in 0 check cases, the resulting behavior in too small value cases may be easier to predict... It's possible that this could explain some anomalities you've been seeing in your measurements. It is not very clear how SYN segments should be handled, so I choose to follow the previous implementation in this respect. Since we don't invoke congestion control modules until after the SYN handshake this is not a problem. Just curious, do you mean that cc modules cannot measure, e.g., initial RTT through this mechanism (though they could do that in init() cb then I suppose)... Or do you mean that they are called already for the ACK that completes the SYN handshake and therefore its skb is being cleaned from the queue right now (this is the case I above refer to)? In the first case the decrementer code is NOP. If the latter, then it is just interface specification question, i.e., if SYNs are treated as zero or one in num_acked for the pkts_acked callback (I have no opinion on this but was just trying to make sure cc modules get what they expect :-)). -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Tue, 29 May 2007, Stephen Hemminger wrote: On Tue, 29 May 2007 23:07:00 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: On Tue, 29 May 2007, Stephen Hemminger wrote: Since we don't invoke congestion control modules until after the SYN handshake this is not a problem. Just curious, do you mean that cc modules cannot measure, e.g., initial RTT through this mechanism (though they could do that in init() cb then I suppose)... Or do you mean that they are called already for the ACK that completes the SYN handshake and therefore its skb is being cleaned from the queue right now (this is the case I above refer to)? In the first case the decrementer code is NOP. If the latter, then it is just interface specification question, i.e., if SYNs are treated as zero or one in num_acked for the pkts_acked callback (I have no opinion on this but was just trying to make sure cc modules get what they expect :-)). We don't switch a socket out of Reno until after the initial handshake. ...It's still not very clear to me what exactly your after means (both here and in your earlier description), i.e., whether clean_rtx_queue call that cleans SYN skb from the queue happens before or after the switch out of reno or not... If I understand the code correctly, this specific clean_rtx_queue call happens after your after but I could be misunderstanding the current 3-way handshake code. :-) As an interface, it makes sense to keep the API with the SYN counting as a packet. Ok, this one answers the remaining question concerning the patch, here is it without the decrementer for SYN case (which IMHO wasn't very beautiful looking anyway :-)). Dave, please consider this to net-2.6. It could be a stable candidate as well, haven't tested yet if it applies cleanly to stable: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. 
In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. Pkts_acked callback of congestion control modules include SYN as a packet for now on. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..74683d8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,6 +2480,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)-icsk_ca_ops; -- 1.5.0.6
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, Ilpo Järvinen wrote: On Sun, 27 May 2007, Baruch Even wrote: * Ilpo J?rvinen [EMAIL PROTECTED] [070527 14:16]: Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intented to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... At least the htcp module that I wrote assumes that the number is actual number of tcp packets so GSO should be considered. Thanks for the info! It is what I suspected... ...I'll write a patch for it tomorrow against net-2.6... Dave, beware that it will partially overlap with the changes made in the patch 8, so you might choose to put the patch 8 on hold until this issue is first resolved... The consequences of this bug are not too large but it does make all congestion control algorithms a lot less aggressive. On my machines GSO is disabled by default (e1000 at 100mbps Tigon3 @ 1Gbps). Agreed, that's my impression too. However, some algorithms do things like 0 checks for it, so it might disturb their dynamics even more than in the too small value cases... Hmm, there seems to be another case that I'm not too sure of... Please check the alternative I choose for SYN handling below... ...hmm... While exploring this SYN thingie, I noticed that commit 164891aadf1721fca4dce473bb0e0998181537c6 drops !FLAG_RETRANS_DATA_ACKED check from rtt_sample call (when combining it with pkts_acked call). I hope that's intentional?!? ...the commit message didn't say anything about it nor was anything in cc modules changed to accomodate that. [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. 
In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. It is not very clear how SYN segments should be handled, so I choose to follow the previous implementation in this respect. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..7dfaabb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,9 +2480,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)-icsk_ca_ops; + if (acked FLAG_SYN_ACKED) + pkts_acked--; + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); -- 1.5.0.6
Re: [PATCH 6/9] [TCP]: Reorganize lost marking code
On Sat, 26 May 2007, Ilpo Järvinen wrote: -static void tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, -int fast_rexmit) +struct sk_buff *tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, +int fast_rexmit) Noticed that the static got dropped here unintentionally, 2nd try included below. [PATCH 6/9] [TCP]: Reorganize lost marking code The indentation started to get scary, so I reorganized code so that some trivial ifs are in tcp_update_scoreboard and the main loops remain in tcp_update_scoreboard_fack. It's much easier to view the actual changes with git-diff -w than from the messy looking (default) diff. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 157 +- 1 files changed, 79 insertions(+), 78 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 452a3b0..c5eda37 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1866,7 +1866,6 @@ static void tcp_mark_head_lost_single(struct sock *sk) before(tp-snd_una, tp-high_seq)) { TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; tp-lost_out += tcp_skb_pcount(skb); - tcp_sync_left_out(tp); } } @@ -1905,99 +1904,85 @@ static void tcp_mark_head_lost_single(struct sock *sk) * Key difference here is that FACK uses both SACK blocks and holes while * RFC3517 is using only SACK blocks when counting for reordering. 
*/ -static void tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, - int fast_rexmit) +static struct sk_buff *tcp_update_scoreboard_fack(struct sock *sk, + u32 entry_seq, + int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); /* Beware: timedout_continue might be pointing to sk_write_queue */ struct sk_buff *timedout_continue = NULL; struct sk_buff *skb; - - if (!tp-sacked_out) { - if (IsFack(tp) || fast_rexmit) { - tcp_mark_head_lost_single(sk); - skb = tcp_write_queue_head(sk); - timedout_continue = tcp_write_queue_next(sk, skb); - } - + unsigned int holes_seen = 0; + + skb = tcp_write_queue_find(sk, entry_seq); + /* If this ever becomes expensive, it can be delayed */ + timedout_continue = tcp_write_queue_next(sk, skb); + if (entry_seq != tp-highest_sack) { + /* Not interested in the last SACKed one we got */ + /* RFC: find_below could help here too */ + skb = tcp_write_queue_prev(sk, skb); + if (IsFack(tp) tcp_skb_timedout(sk, skb) + (tp-fackets_out tp-packets_out)) { + timedout_continue = tcp_write_queue_find(sk, tp-highest_sack); + timedout_continue = tcp_write_queue_next(sk, timedout_continue); + } else + timedout_continue = NULL; } else { - unsigned int holes_seen = 0; - - skb = tcp_write_queue_find(sk, entry_seq); - /* If this ever becomes expensive, it can be delayed */ - timedout_continue = tcp_write_queue_next(sk, skb); - if (entry_seq != tp-highest_sack) { - /* Not interested in the last SACKed one we got */ - /* RFC: find_below could help here too */ - skb = tcp_write_queue_prev(sk, skb); - if (IsFack(tp) tcp_skb_timedout(sk, skb) - (tp-fackets_out tp-packets_out)) { - timedout_continue = tcp_write_queue_find(sk, tp-highest_sack); - timedout_continue = tcp_write_queue_next(sk, timedout_continue); - } else - timedout_continue = NULL; - } else { - unsigned int reord_count = IsFack(tp) ? 
0 : 1; - - /* Phase I: Search until TCP can mark */ - tcp_for_write_queue_backwards_from(skb, sk) { - if ((tp-fackets_out = tp-sacked_out + - tp-lost_out + - holes_seen) || - (TCP_SKB_CB(skb)-sacked TCPCB_LOST)) - goto backwards_walk_done; - - if (IsFack(tp) tcp_skb_timedout(sk, skb)) - break; - else - timedout_continue = NULL; - - if (IsFack(tp
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sat, 26 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:35:53 +0300 Dave, you could consider applying other than the last one if they seem ok to you too (you'll need to rebase your tcp-2.6 in that case first to apply cleanly those that touch tcp_sync_left_out :-)). Absolutely, I'll do the quick rebase of tcp-2.6 and review + applying of your patches (except the last one) over the weekend. While you're in the right context (reviewing patch 8), you could also look if tcp_clean_rtx_queue does a right thing when passing a strange pkts_acked to congestion control modules. I wonder if it really should ignore GSO the way it does currently... I read some cc module code and some was adding it to snd_cwnd_cnt, etc. which is a strong indication that GSO should be considered... Also if the head is GSO skb that is not completely acked, the loop breaks with pkts_acked being zero, I doubt that can be correct... ...added Stephen H. and Herbert X. to this thread since also they might have some idea about it. ...Anyway, if it's wrong, the fix should go to net-2.6 rather than to tcp-2.6... -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sun, 27 May 2007 10:58:27 +0300 (EEST) While you're in the right context (reviewing patch 8), you could also look if tcp_clean_rtx_queue does a right thing when passing a strange pkts_acked to congestion control modules. I wonder if it really should ignore GSO the way it does currently... I read some cc module code and some was adding it to snd_cwnd_cnt, etc. which is a strong indication that GSO should be considered... Also if the head is GSO skb that is not completely acked, the loop breaks with pkts_acked being zero, I doubt that can be correct... [...snip...] will likely take a look at these issues wrt. patch 8 tomorrow. ...I hope I got myself understood correctly (with my non-native English :-))... ...My intention was to say that there might be a bug in tcp_clean_rtx_queue too, which on logical level is unrelated to the patch 8 itself and to the change it makes (but caught my attention while I was doing that patch as it resides in the same function). ...There could have been confusion since I'm too changing things in the same function and because my article usage is usually far from perfect... :-) Also the fastretrans_alert arg rename to the same name (i.e., pkts_acked) in patch 8 is unrelated to the potential bug. Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intended to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, Baruch Even wrote: * Ilpo Järvinen [EMAIL PROTECTED] [070527 14:16]: Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intended to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... At least the htcp module that I wrote assumes that the number is actual number of tcp packets so GSO should be considered. Thanks for the info! It is what I suspected... ...I'll write a patch for it tomorrow against net-2.6... Dave, beware that it will partially overlap with the changes made in the patch 8, so you might choose to put the patch 8 on hold until this issue is first resolved... The consequences of this bug are not too large but it does make all congestion control algorithms a lot less aggressive. On my machines GSO is disabled by default (e1000 at 100mbps Tigon3 @ 1Gbps). Agreed, that's my impression too. However, some algorithms do things like 0 checks for it, so it might disturb their dynamics even more than in the too small value cases... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9]: tcp-2.6 patchset
Hi, Here are some changes to TCP I've been baking. Before doing this patchset, I rebased tcp-2.6 branch to the current net-2.6 (goes almost cleanly) because there are some dependencies to the TCP work in there. I booted these today and no very obvious problems showed up (OOPSes, BUG()s, reported scoreboard leaks, etc.) while I did some transfers with varying sysctl settings: SACK/FACK on/off and FRTO enabled. However, no in depth analysis on behavior has been made. The last patch is highly experimental (and incomplete), it should NOT be applied yet. I suspect that there are at least off-by-MSS things in it. I'm asking for your comments. Dave, you could consider applying other than the last one if they seem ok to you too (you'll need to rebase your tcp-2.6 in that case first to apply cleanly those that touch tcp_sync_left_out :-)). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] [TCP] FRTO: remove unnecessary fackets/sacked_out recounting
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] F-RTO does not touch SACKED_ACKED bits at all, so there is no need to recount them in tcp_enter_frto_loss. After removal of the else branch, nested ifs can be combined. This must also reset sacked_out when SACK is not in use as TCP could have received some duplicate ACKs prior RTO. To achieve that in a sane manner, tcp_reset_reno_sack was re-placed by the previous patch. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 23 +++ 1 files changed, 7 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 925a26f..4dab5d7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1479,17 +1479,15 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; - tp-sacked_out = 0; tp-lost_out = 0; - tp-fackets_out = 0; tp-retrans_out = 0; + if (IsReno(tp)) + tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... 
@@ -1501,19 +1499,12 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } else { TCP_SKB_CB(skb)-sacked = ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } - if (!(TCP_SKB_CB(skb)-sackedTCPCB_SACKED_ACKED)) { - /* Do not mark those segments lost that were -* forward transmitted after RTO -*/ - if (!after(TCP_SKB_CB(skb)-end_seq, - tp-frto_highmark)) { - TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; - tp-lost_out += tcp_skb_pcount(skb); - } - } else { - tp-sacked_out += tcp_skb_pcount(skb); - tp-fackets_out = cnt; + /* Don't lost mark skbs that were fwd transmitted after RTO */ + if (!(TCP_SKB_CB(skb)-sackedTCPCB_SACKED_ACKED) + !after(TCP_SKB_CB(skb)-end_seq, tp-frto_highmark)) { + TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; + tp-lost_out += tcp_skb_pcount(skb); } } tcp_sync_left_out(tp); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] [TCP]: Tighten tcp_sock's belt, drop left_out
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] It is easily calculable when needed and user are not that many after all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |1 - include/net/tcp.h|4 ++-- net/ipv4/tcp_input.c | 11 ++- net/ipv4/tcp_minisocks.c |1 - net/ipv4/tcp_output.c| 12 +++- 5 files changed, 7 insertions(+), 22 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 728321f..63e04f4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -305,7 +305,6 @@ struct tcp_sock { u32 rtt_seq;/* sequence number to update rttvar */ u32 packets_out;/* Packets which are in flight*/ - u32 left_out; /* Packets which leaved network */ u32 retrans_out;/* Retransmitted packets out*/ /* * Options received (usually on last packet, some only on SYN packets). diff --git a/include/net/tcp.h b/include/net/tcp.h index dd58cf2..5d866a4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -716,7 +716,8 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { - return (tp-packets_out - tp-left_out + tp-retrans_out); + return tp-packets_out - (tp-sacked_out + tp-lost_out) + + tp-retrans_out; } /* If cwnd ssthresh, we may raise ssthresh to be half-way to cwnd. 
@@ -738,7 +739,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) { BUG_ON(tp-rx_opt.sack_ok (tp-sacked_out + tp-lost_out tp-packets_out)); - tp-left_out = tp-sacked_out + tp-lost_out; } extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4dab5d7..c17e077 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1301,8 +1301,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, } } - tp-left_out = tp-sacked_out + tp-lost_out; - if ((state.reord tp-fackets_out) icsk-icsk_ca_state != TCP_CA_Loss (!tp-frto_highmark || after(tp-snd_una, tp-frto_highmark))) tcp_update_reordering(sk, ((tp-fackets_out + 1) - state.reord), 0); @@ -1363,7 +1361,6 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp-sacked_out = 0; - tp-left_out = tp-lost_out; } int tcp_use_frto(struct sock *sk) @@ -1526,7 +1523,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) void tcp_clear_retrans(struct tcp_sock *tp) { - tp-left_out = 0; tp-retrans_out = 0; tp-fackets_out = 0; @@ -2023,7 +2019,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) printk(KERN_DEBUG Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n, msg, NIPQUAD(inet-daddr), ntohs(inet-dport), - tp-snd_cwnd, tp-left_out, + tp-snd_cwnd, tp-sacked_out + tp-lost_out, tp-snd_ssthresh, tp-prior_ssthresh, tp-packets_out); } @@ -2152,7 +2148,6 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, partial loss); tp-lost_out = 0; - tp-left_out = tp-sacked_out; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)-icsk_retransmits = 0; @@ -2176,8 +2171,6 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp-left_out = tp-sacked_out; - if (tp-retrans_out == 0) tp-retrans_stamp = 0; @@ -2187,7 +2180,7 @@ static void tcp_try_to_open(struct sock *sk, int 
flag) if (inet_csk(sk)-icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp-left_out || tp-retrans_out || tp-undo_marker) + if (tp-sacked_out || tp-retrans_out || tp-undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)-icsk_ca_state != state) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 42588eb..598ddec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -399,7 +399,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk-icsk_rto = TCP_TIMEOUT_INIT; newtp-packets_out = 0; - newtp-left_out = 0; newtp-retrans_out = 0; newtp-sacked_out = 0; newtp-fackets_out = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8891775..d67fb3d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -682,10 +682,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff
[PATCH 1/9] [TCP]: Move Reno SACKed_out counter functions earlier
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 98 -- 1 files changed, 47 insertions(+), 51 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb81e31..925a26f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1319,6 +1319,53 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 holes; + + holes = max(tp-lost_out, 1U); + holes = min(holes, tp-packets_out); + + if ((tp-sacked_out + holes) tp-packets_out) { + tp-sacked_out = tp-packets_out - holes; + tcp_update_reordering(sk, tp-packets_out + addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp-sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_sync_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 = tp-sacked_out) + tp-sacked_out = 0; + else + tp-sacked_out -= acked-1; + } + tcp_check_reno_reordering(sk, acked); + tcp_sync_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp-sacked_out = 0; + tp-left_out = tp-lost_out; +} + int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -1734,57 +1781,6 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. 
- * The only another reason could be bug in receiver TCP. - */ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 holes; - - holes = max(tp-lost_out, 1U); - holes = min(holes, tp-packets_out); - - if ((tp-sacked_out + holes) tp-packets_out) { - tp-sacked_out = tp-packets_out - holes; - tcp_update_reordering(sk, tp-packets_out + addend, 0); - } -} - -/* Emulate SACKs for SACKless connection: account for a new dupack. */ - -static void tcp_add_reno_sack(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - tp-sacked_out++; - tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); -} - -/* Account for ACK, ACKing some data in Reno Recovery phase. */ - -static void tcp_remove_reno_sacks(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (acked 0) { - /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 = tp-sacked_out) - tp-sacked_out = 0; - else - tp-sacked_out -= acked-1; - } - tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); -} - -static inline void tcp_reset_reno_sack(struct tcp_sock *tp) -{ - tp-sacked_out = 0; - tp-left_out = tp-lost_out; -} - /* RFC: This is from the original, I doubt that this is necessary at all: * clear xmit_retrans hint if seq of this skb is beyond hint. How could we * retransmitted past LOST markings in the first place? I'm not fully sure -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html