[RFC PATCH net-2.6.24 0/5]: TCP sacktag cache usage recoded
Hi all, After a couple of wrong-wayed before/after()s and one infinite loopy version, here's the current trial version of a sacktag cache usage recode Two first patches come from tcp-2.6 (rebased and rotated). This series applies cleanly only on top of the other three patch series I posted earlier today. The last debug patch provides some statistics for those interested enough. Dave, please DO NOT apply! ...Some thoughts could be nice though :-). It should improve processing of such likely events as cumulative ACKs and new forward holed SACK considerably because full walk is not necessary anymore (old code could have been tweaked to cover them but it's better to drop each special case handling altogether and do a generic solution). Redundancy of fastpath hints and highest_sack stuff is also addressed, however, it might have slight effect as the hint could point to something less than highest_sack occasionally; whether that's significant remains to be seen... In all cases except hint below highest_sack, the new solution should perform at least as well as the old code (with a bit larger constant though, no additional cache misses though) because the SACK blocks the old code chose not to process should all fall to LINUX_MIB_TCP_FULLSKIP category. It's possible to improve it easily with RB-tree stuff though this version is based on code using linked lists. I'm not yet too sure that I got everything 100% correct as I tweak start/end_seqs and exit skb loops in a way that is prone to off-by-one errors, could miss skb here and there. I'll probably also recode dsack handling too to avoid recursion. Stephen, Sangtae, others experiencing those unexpected RTOs during recovery of large windowed TCP, could you please give it a try to see if it helps any... -- i. ps. Our net-2.6.24 (and mainline?) DSACK processing code could be broken btw. DSACK in the middle of other SACK block during in-order walk seems to not be processed at all as the earlier (sorted) block caused skb to advance past it already? 
(Just occurred to me, I'll see what I can do to that if I've enough time). - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TCP]: Create tcp_sacktag_state.
From: David S. Miller [EMAIL PROTECTED] It is difficult to break out the inner-logic of tcp_sacktag_write_queue() into worker functions because so many local variables get updated in-place. Start to overcome this by creating a structure block of state variables that can be passed around into worker routines. [I made minor tweaks due to rebase/reordering of stuff in tcp-2.6 tree, and dropped found_dup_sack dup_sack from state -ij] Signed-off-by: David S. Miller [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 89 ++--- 1 files changed, 47 insertions(+), 42 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 259f517..04ff465 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1102,6 +1102,13 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, return !before(start_seq, end_seq - tp-max_window); } +struct tcp_sacktag_state { + unsigned int flag; + int reord; + int prior_fackets; + u32 lost_retrans; + int first_sack_index; +}; static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, @@ -1146,25 +1153,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)3; - int reord = tp-packets_out; - int prior_fackets; - u32 lost_retrans = 0; - int flag = 0; - int found_dup_sack = 0; + struct tcp_sacktag_state state; + int found_dup_sack; int cached_fack_count; int i; - int first_sack_index; + int force_one_sack; + + state.flag = 0; if (!tp-sacked_out) { tp-fackets_out = 0; tp-highest_sack = tp-snd_una; } - prior_fackets = tp-fackets_out; - found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, -num_sacks, prior_snd_una); + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, num_sacks, prior_snd_una); if (found_dup_sack) - flag |= FLAG_DSACKING_ACK; + 
state.flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1177,18 +1181,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ * if the only SACK change is the increase of the end_seq of * the first block then only apply that SACK block * and use retrans queue hinting otherwise slowpath */ - flag = 1; + force_one_sack = 1; for (i = 0; i num_sacks; i++) { __be32 start_seq = sp[i].start_seq; __be32 end_seq = sp[i].end_seq; if (i == 0) { if (tp-recv_sack_cache[i].start_seq != start_seq) - flag = 0; + force_one_sack = 0; } else { if ((tp-recv_sack_cache[i].start_seq != start_seq) || (tp-recv_sack_cache[i].end_seq != end_seq)) - flag = 0; + force_one_sack = 0; } tp-recv_sack_cache[i].start_seq = start_seq; tp-recv_sack_cache[i].end_seq = end_seq; @@ -1199,8 +1203,8 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ tp-recv_sack_cache[i].end_seq = 0; } - first_sack_index = 0; - if (flag) + state.first_sack_index = 0; + if (force_one_sack) num_sacks = 1; else { int j; @@ -1218,17 +1222,14 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ sp[j+1] = tmp; /* Track where the first SACK block goes to */ - if (j == first_sack_index) - first_sack_index = j+1; + if (j == state.first_sack_index) + state.first_sack_index = j+1; } } } } - /* clear flag as used for different purpose in following code */ - flag = 0; - /* Use SACK fastpath hint if valid */ cached_skb = tp-fastpath_skb_hint; cached_fack_count = tp-fastpath_cnt_hint; @@ -1237,12 +1238,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_fack_count = 0; } + state.reord = tp
[PATCH 2/5] [TCP]: Create tcp_sacktag_one().
From: David S. Miller [EMAIL PROTECTED] Worker function that implements the main logic of the inner-most loop of tcp_sacktag_write_queue(). Signed-off-by: David S. Miller [EMAIL PROTECTED] Acked-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 213 ++ 1 files changed, 110 insertions(+), 103 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 04ff465..76e9c9b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1143,6 +1143,114 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, return dup_sack; } +static void tcp_sacktag_one(struct sk_buff *skb, struct tcp_sock *tp, + struct tcp_sacktag_state *state, int in_sack, + int dup_sack, int fack_count, u32 end_seq) +{ + u8 sacked = TCP_SKB_CB(skb)-sacked; + + /* Account D-SACK for retransmitted packet. */ + if ((dup_sack in_sack) + (sacked TCPCB_RETRANS) + after(TCP_SKB_CB(skb)-end_seq, tp-undo_marker)) + tp-undo_retrans--; + + /* The frame is ACKed. */ + if (!after(TCP_SKB_CB(skb)-end_seq, tp-snd_una)) { + if (sacked TCPCB_RETRANS) { + if ((dup_sack in_sack) + (sacked TCPCB_SACKED_ACKED)) + state-reord = min(fack_count, state-reord); + } else { + /* If it was in a hole, we detected reordering. */ + if (fack_count state-prior_fackets + !(sacked TCPCB_SACKED_ACKED)) + state-reord = min(fack_count, state-reord); + } + + /* Nothing to do; acked frame is about to be dropped. */ + return; + } + + if ((sacked TCPCB_SACKED_RETRANS) + after(end_seq, TCP_SKB_CB(skb)-ack_seq) + (!state-lost_retrans || after(end_seq, state-lost_retrans))) + state-lost_retrans = end_seq; + + if (!in_sack) + return; + + if (!(sacked TCPCB_SACKED_ACKED)) { + if (sacked TCPCB_SACKED_RETRANS) { + /* If the segment is not tagged as lost, +* we do not clear RETRANS, believing +* that retransmission is still in flight. 
+*/ + if (sacked TCPCB_LOST) { + TCP_SKB_CB(skb)-sacked = + ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); + tp-lost_out -= tcp_skb_pcount(skb); + tp-retrans_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp-retransmit_skb_hint = NULL; + } + } else { + /* New sack for not retransmitted frame, +* which was in hole. It is reordering. +*/ + if (!(sacked TCPCB_RETRANS) + fack_count state-prior_fackets) + state-reord = min(fack_count, state-reord); + + if (sacked TCPCB_LOST) { + TCP_SKB_CB(skb)-sacked = ~TCPCB_LOST; + tp-lost_out -= tcp_skb_pcount(skb); + + /* clear lost hint */ + tp-retransmit_skb_hint = NULL; + } + /* SACK enhanced F-RTO detection. +* Set flag if and only if non-rexmitted +* segments below frto_highmark are +* SACKed (RFC4138; Appendix B). +* Clearing correct due to in-order walk +*/ + if (after(end_seq, tp-frto_highmark)) { + state-flag = ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked TCPCB_RETRANS)) + state-flag |= FLAG_ONLY_ORIG_SACKED; + } + } + + TCP_SKB_CB(skb)-sacked |= TCPCB_SACKED_ACKED; + state-flag |= FLAG_DATA_SACKED; + tp-sacked_out += tcp_skb_pcount(skb); + + if (fack_count tp-fackets_out) + tp-fackets_out = fack_count; + + if (after(TCP_SKB_CB(skb)-seq, + tp-highest_sack)) + tp-highest_sack = TCP_SKB_CB(skb)-seq; + } else { + if (dup_sack (sackedTCPCB_RETRANS)) + state-reord = min(fack_count, state-reord); + } + + /* D-SACK. We can detect redundant retransmission +* in S|R and plain R frames and clear
[RFC PATCH 4/5] [TCP]: Rewrite sack_recv_cache (WIP)
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Previously a number of cases in TCP SACK processing fail to take advantage of costly stored information in sack_recv_cache. Most importantly expected events such as cumulative ACK, new hole ACKs and first ACK after RTO fall to this category. Processing on such ACKs results in rather long walks building up latencies (which easily gets nasty when window is large), which are completely unnecessary, usually no new information was gathered except the new SACK block above the hole in the respective case. Since the inclusion of highest_sack, there's a lot of information that is very likely redundant (SACK fastpath hint stuff, fackets_out, highest_sack), though there's no ultimate guarantee that they'll remain the same the whole time (in all unearthly scenarios). Take advantage of this too and drop the fastpath hint. Effectively this drops the special-cased fastpath. This change adds some complexity to introduce a better-covered fastpath. The current ACK's SACK blocks are compared against each cached block individually and only ranges that are new are then scanned by the high constant walk. For other parts of the write queue, even when in a previously known part of the SACK blocks, a faster skip function is used. In addition, whenever possible, TCP fast-forwards to the highest_sack skb that was made available earlier. In the typical case, no other things but this fast-forward and mandatory markings after that occur, making the access pattern quite similar to the former fastpath. DSACKs are a special case that must always be walked. 
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |4 +- include/net/tcp.h |1 - net/ipv4/tcp_input.c | 320 ++-- net/ipv4/tcp_output.c | 12 +-- 4 files changed, 202 insertions(+), 135 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1d6be2a..8d91eac 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -330,7 +330,7 @@ struct tcp_sock { struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ - struct tcp_sack_block_wire recv_sack_cache[4]; + struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* highest skb with SACK received * (validity guaranteed only if @@ -343,9 +343,7 @@ struct tcp_sock { struct sk_buff *scoreboard_skb_hint; struct sk_buff *retransmit_skb_hint; struct sk_buff *forward_skb_hint; - struct sk_buff *fastpath_skb_hint; - int fastpath_cnt_hint; int lost_cnt_hint; int retransmit_cnt_hint; diff --git a/include/net/tcp.h b/include/net/tcp.h index 8bc64b7..d5def9b 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1078,7 +1078,6 @@ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); - tp-fastpath_skb_hint = NULL; } /* MD5 Signature */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 85dd4b0..9dfdd67 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1106,11 +1106,15 @@ struct tcp_sacktag_state { unsigned int flag; int reord; int prior_fackets; + int fack_count; u32 lost_retrans; - int first_sack_index; + u32 dup_start; + u32 dup_end; }; -static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, +static int tcp_check_dsack(struct tcp_sock *tp, + struct tcp_sacktag_state *state, + struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) { @@ -1120,6 +1124,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, 
struct sk_buff *ack_skb, if (before(start_seq_0, TCP_SKB_CB(ack_skb)-ack_seq)) { dup_sack = 1; + state-dup_start = start_seq_0; + state-dup_end = end_seq_0; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1) { @@ -1129,6 +1135,8 @@ static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, if (!after(end_seq_0, end_seq_1) !before(start_seq_0, start_seq_1)) { dup_sack = 1; + state-dup_start = start_seq_1; + state-dup_end = end_seq_1; tcp_dsack_seen(tp); NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } @@ -1251,6 +1259,104 @@ static void tcp_sacktag_one(struct sk_buff *skb, struct sock *sk, } } +static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk
[DEVELOPER PATCH 5/5] [TCP]: Track sacktag
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h | 20 +++ net/ipv4/proc.c | 20 +++ net/ipv4/tcp_input.c | 52 +++-- 3 files changed, 89 insertions(+), 3 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 89f0c2b..42b8c07 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -214,6 +214,26 @@ enum LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ LINUX_MIB_TCPDSACKIGNOREDNOUNDO,/* TCPSACKIgnoredNoUndo */ LINUX_MIB_TCPSPURIOUSRTOS, /* TCPSpuriousRTOs */ + LINUX_MIB_TCP_SACKTAG, + LINUX_MIB_TCP_SACK0, + LINUX_MIB_TCP_SACK1, + LINUX_MIB_TCP_SACK2, + LINUX_MIB_TCP_SACK3, + LINUX_MIB_TCP_SACK4, + LINUX_MIB_TCP_WALKEDSKBS, + LINUX_MIB_TCP_WALKEDDSACKS, + LINUX_MIB_TCP_SKIPPEDSKBS, + LINUX_MIB_TCP_NOCACHE, + LINUX_MIB_TCP_FULLWALK, + LINUX_MIB_TCP_HEADWALK, + LINUX_MIB_TCP_TAILWALK, + LINUX_MIB_TCP_FULLSKIP, + LINUX_MIB_TCP_TAILSKIP, + LINUX_MIB_TCP_HEADSKIP, + LINUX_MIB_TCP_FULLSKIP_TOHIGH, + LINUX_MIB_TCP_TAILSKIP_TOHIGH, + LINUX_MIB_TCP_NEWSKIP, + LINUX_MIB_TCP_CACHEREMAINING, __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 9dee70e..4808f82 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -246,6 +246,26 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPDSACKIgnoredOld, LINUX_MIB_TCPDSACKIGNOREDOLD), SNMP_MIB_ITEM(TCPDSACKIgnoredNoUndo, LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_ITEM(TCPSpuriousRTOs, LINUX_MIB_TCPSPURIOUSRTOS), + SNMP_MIB_ITEM(TCP_SACKTAG, LINUX_MIB_TCP_SACKTAG), + SNMP_MIB_ITEM(TCP_SACK0, LINUX_MIB_TCP_SACK0), + SNMP_MIB_ITEM(TCP_SACK1, LINUX_MIB_TCP_SACK1), + SNMP_MIB_ITEM(TCP_SACK2, LINUX_MIB_TCP_SACK2), + SNMP_MIB_ITEM(TCP_SACK3, LINUX_MIB_TCP_SACK3), + SNMP_MIB_ITEM(TCP_SACK4, LINUX_MIB_TCP_SACK4), + SNMP_MIB_ITEM(TCP_WALKEDSKBS, LINUX_MIB_TCP_WALKEDSKBS), + SNMP_MIB_ITEM(TCP_WALKEDDSACKS, LINUX_MIB_TCP_WALKEDDSACKS), + SNMP_MIB_ITEM(TCP_SKIPPEDSKBS, LINUX_MIB_TCP_SKIPPEDSKBS), + 
SNMP_MIB_ITEM(TCP_NOCACHE, LINUX_MIB_TCP_NOCACHE), + SNMP_MIB_ITEM(TCP_FULLWALK, LINUX_MIB_TCP_FULLWALK), + SNMP_MIB_ITEM(TCP_HEADWALK, LINUX_MIB_TCP_HEADWALK), + SNMP_MIB_ITEM(TCP_TAILWALK, LINUX_MIB_TCP_TAILWALK), + SNMP_MIB_ITEM(TCP_FULLSKIP, LINUX_MIB_TCP_FULLSKIP), + SNMP_MIB_ITEM(TCP_TAILSKIP, LINUX_MIB_TCP_TAILSKIP), + SNMP_MIB_ITEM(TCP_HEADSKIP, LINUX_MIB_TCP_HEADSKIP), + SNMP_MIB_ITEM(TCP_FULLSKIP_TOHIGH, LINUX_MIB_TCP_FULLSKIP_TOHIGH), + SNMP_MIB_ITEM(TCP_TAILSKIP_TOHIGH, LINUX_MIB_TCP_TAILSKIP_TOHIGH), + SNMP_MIB_ITEM(TCP_NEWSKIP, LINUX_MIB_TCP_NEWSKIP), + SNMP_MIB_ITEM(TCP_CACHEREMAINING, LINUX_MIB_TCP_CACHEREMAINING), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 9dfdd67..38045c8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1306,6 +1306,10 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, tcp_sacktag_one(skb, sk, state, in_sack, dup_sack, state-fack_count, end_seq); + + NET_INC_STATS_BH(LINUX_MIB_TCP_WALKEDSKBS); + if (dup_sack) + NET_INC_STATS_BH(LINUX_MIB_TCP_WALKEDDSACKS); } return skb; } @@ -1329,6 +1333,7 @@ static struct sk_buff *tcp_sacktag_skip(struct sk_buff *skb, struct sock *sk, skb = tcp_sacktag_walk(skb, sk, state, state-dup_start, state-dup_end, 1); } + NET_INC_STATS_BH(LINUX_MIB_TCP_SKIPPEDSKBS); } return skb; } @@ -1458,9 +1463,22 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cache++; } + NET_INC_STATS_BH(LINUX_MIB_TCP_SACKTAG); + switch (used_sacks) { + case 0: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK0); break; + case 1: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK1); break; + case 2: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK2); break; + case 3: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK3); break; + case 4: NET_INC_STATS_BH(LINUX_MIB_TCP_SACK4); break; + } + + if (!tcp_sack_cache_ok(tp, cache)) + NET_INC_STATS_BH(LINUX_MIB_TCP_NOCACHE); + while (i used_sacks) { u32 start_seq = sp[i].start_seq; u32 end_seq = sp[i].end_seq; + int fullwalk 
= 0; /* Event B in the comment above. */ if (after(end_seq, tp-high_seq)) @@ -1473,41 +1491,69
Re: [PATCH 1/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Patch 1: David Miller's red-black tree code, tweaked for 2.6.22.6, with some bugfixes It would help if you would leave the original changes as is (rb-tree and fack_count separated) and add your work on top of that... diff -ur linux-2.6.22.6/include/net/tcp.h linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h --- linux-2.6.22.6/include/net/tcp.h 2007-08-30 23:21:01.0 -0700 +++ linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.0 -0700 @@ -540,6 +540,7 @@ __u32 seq;/* Starting sequence number */ __u32 end_seq;/* SEQ + FIN + SYN + datalen*/ __u32 when; /* used to compute rtt's*/ + unsigned intfack_count; /* speed up SACK processing */ __u8flags; /* TCP header flags.*/ /* NOTE: These must match up to the flags byte in a @@ -1043,12 +1044,12 @@ } /*from STCP */ -static inline void clear_all_retrans_hints(struct tcp_sock *tp){ +static inline void clear_all_retrans_hints(struct tcp_sock *tp) +{ Unrelated change, please don't do that. Besides, it's already fixed in net-2.6.24. tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; tp-forward_skb_hint = NULL; - tp-fastpath_skb_hint = NULL; } /* MD5 Signature */ @@ -1227,9 +1229,61 @@ sk-sk_send_head = NULL; } +static inline struct sk_buff *tcp_write_queue_find(struct sock *sk, __u32 seq) +{ + struct rb_node *rb_node = tcp_sk(sk)-write_queue_rb.rb_node; + struct sk_buff *skb = NULL; + + while (rb_node) { + struct sk_buff *tmp = rb_entry(rb_node,struct sk_buff,rb); + if (TCP_SKB_CB(tmp)-end_seq seq) { This is old and buggy version of the rb-tree code. Get the latest rb-tree patch from tcp-2.6 tree. + skb = tmp; + if (TCP_SKB_CB(tmp)-seq = seq) ...fixed in tcp-2.6. 
+ break; + rb_node = rb_node-rb_left; + } else + rb_node = rb_node-rb_right; + + } + return skb; +} + +static inline void tcp_rb_insert(struct sk_buff *skb, struct rb_root *root) +{ + struct rb_node **rb_link, *rb_parent; + __u32 seq = TCP_SKB_CB(skb)-seq; + + rb_link = root-rb_node; + rb_parent = NULL; + while (*rb_link != NULL) { + struct sk_buff *tmp = rb_entry(*rb_link,struct sk_buff,rb); + rb_parent = *rb_link; + if (TCP_SKB_CB(tmp)-end_seq seq) { + BUG_ON(TCP_SKB_CB(tmp)-seq = seq); ...these are broken as well. + rb_link = rb_parent-rb_left; + } else { + rb_link = rb_parent-rb_right; + } + } + rb_link_node(skb-rb, rb_parent, rb_link); + rb_insert_color(skb-rb, root); +} + +static inline void tcp_rb_unlink(struct sk_buff *skb, struct rb_root *root) +{ + rb_erase(skb-rb, root); +} + static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { + struct sk_buff *tail = tcp_write_queue_tail(sk); + unsigned int fc = 0; + + if (tail) + fc = TCP_SKB_CB(tail)-fack_count + tcp_skb_pcount(tail); + TCP_SKB_CB(skb)-fack_count = fc; __skb_queue_tail(sk-sk_write_queue, skb); + tcp_rb_insert(skb, tcp_sk(sk)-write_queue_rb); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) diff -ur linux-2.6.22.6/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c --- linux-2.6.22.6/net/ipv4/tcp_input.c 2007-08-30 23:21:01.0 -0700 +++ linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c2007-09-13 18:23:16.0 -0700 @@ -947,14 +947,13 @@ unsigned char *ptr = (skb_transport_header(ack_skb) + TCP_SKB_CB(ack_skb)-sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); - struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)3; int reord = tp-packets_out; int prior_fackets; u32 lost_retrans = 0; int flag = 0; int found_dup_sack = 0; - int cached_fack_count; + int fack_count_base; int i; int first_sack_index; @@ -1020,7 +1019,6 @@ num_sacks = 1; else { int j; - 
tp-fastpath_skb_hint = NULL; /* order SACK blocks to allow in order walk of the retrans queue */ for (i = num_sacks-1; i 0; i--) { @@ -1045,14 +1043,7 @@ /* clear flag as used for different purpose in following code */ flag = 0; - /* Use SACK fastpath hint if valid */ - cached_skb = tp-fastpath_skb_hint; -
Re: [PATCH 2/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Patch 2: fixes to fack_counts and enhancement of SACK fast path ...Usually these are not combined in patches but a separate patch per change. diff -ur linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h linux-2.6.22.6-rbtree-tomq/include/net/tcp.h --- linux-2.6.22.6-rbtree-davem-fixed/include/net/tcp.h 2007-09-19 17:36:07.0 -0700 +++ linux-2.6.22.6-rbtree-tomq/include/net/tcp.h 2007-09-19 12:22:06.0 -0700 @@ -1213,6 +1213,11 @@ ...Btw, please always use -p to diff as well as it helps review :-). sk-sk_send_head = tcp_write_queue_next(sk, skb); if (sk-sk_send_head == (struct sk_buff *)sk-sk_write_queue) sk-sk_send_head = NULL; + else + /* update fack_count of send_head. Since we've sent skb already, + * its packet count must be set by now. */ + TCP_SKB_CB(sk-sk_send_head)-fack_count = + TCP_SKB_CB(skb)-fack_count + tcp_skb_pcount(skb); /* Don't override Nagle indefinately with F-RTO */ if (tp-frto_counter == 2) tp-frto_counter = 3; @@ -1310,19 +1315,22 @@ /* An insert into the middle of the write queue causes the fack * counts in subsequent packets to become invalid, fix them up. 
*/ -static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *first) +static inline void tcp_reset_fack_counts(struct sock *sk, struct sk_buff *skb) { - struct sk_buff *prev = first-prev; + struct sk_buff *prev = skb-prev; unsigned int fc = 0; if (prev != (struct sk_buff *) sk-sk_write_queue) fc = TCP_SKB_CB(prev)-fack_count + tcp_skb_pcount(prev); - while (first != (struct sk_buff *)sk-sk_write_queue) { - TCP_SKB_CB(first)-fack_count = fc; + while (skb != (struct sk_buff *)sk-sk_write_queue) { + if (TCP_SKB_CB(skb)-fack_count == fc || !tcp_skb_pcount(skb)) What is this !tcp_skb_pcount(skb) for I think what we really want here is this:|| (skb == tcp_send_head(sk)) + break; - fc += tcp_skb_pcount(first); - first = first-next; + TCP_SKB_CB(skb)-fack_count = fc; + + fc += tcp_skb_pcount(skb); + skb = skb-next; } } diff -ur linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c linux-2.6.22.6-rbtree-tomq/net/ipv4/tcp_input.c --- linux-2.6.22.6-rbtree-davem-fixed/net/ipv4/tcp_input.c2007-09-13 18:23:16.0 -0700 +++ linux-2.6.22.6-rbtree-tomq/net/ipv4/tcp_input.c 2007-09-19 12:27:42.0 -0700 @@ -956,6 +956,7 @@ int fack_count_base; int i; int first_sack_index; + u32 prev_end_seq = 0; if (!tp-sacked_out) tp-fackets_out = 0; @@ -1000,6 +1001,7 @@ if (i == 0) { if (tp-recv_sack_cache[i].start_seq != start_seq) flag = 0; + prev_end_seq = ntohl(tp-recv_sack_cache[i].end_seq); } else { if ((tp-recv_sack_cache[i].start_seq != start_seq) || (tp-recv_sack_cache[i].end_seq != end_seq)) @@ -1016,9 +1018,16 @@ first_sack_index = 0; if (flag) + /* all that has changed is end of first SACK block. So all we + * need to do is tag those skbs that were'nt tagged last time. */ IMHO, a bit verbose comment. Besides, we already tell above that what SACK fastpath is all about... num_sacks = 1; else { int j; + + /* more than just end of first SACK block has changed; invalidate + * prev_end_seq */ + + prev_end_seq = 0; Don't tell what's obvious from code as well. 
/* order SACK blocks to allow in order walk of the retrans queue */ for (i = num_sacks-1; i 0; i--) { @@ -1051,6 +1060,8 @@ int fack_count; int dup_sack = (found_dup_sack (i == first_sack_index)); + if (prev_end_seq) start_seq = prev_end_seq; + ...We never add statements on the same line after if statements (see Documentation/CodingStyle) -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 0/2] David Miller's rbtree patches for 2.6.22.6
On Wed, 19 Sep 2007, Tom Quetchenbach wrote: Here are a couple of patches against 2.6.22.6. The first one is just David's patches tweaked for 2.6.22.6, with a couple of minor bugfixes to get it to compile and not crash. Why did you combine the original patches into a single larger one, I think Dave made them separate on purpose. (I also changed __tcp_insert_write_queue_tail() to set the fack_count of the new packet to the fack_count of the tail plus the packet count of the tail, not the packet count of the new skb, because I think that's how it was intended to be. Right? I think I noticed a similar off-by-pcount error when I looked a long time ago, so I guess you're correct. We're only interested in the delta of it anyway and add the current skb's pcount to it (which is not fixed until tcp_fragment in sacktag is past). In the second patch there are a couple of significant changes. One is (as Baruch suggested) to modify the existing SACK fast path so that we don't tag packets we've already tagged when we advance by a packet. This solution would still spend an extensive amount of time in the processing loop whenever the recv_sack_cache fast-path is not taken, that is, e.g. when a cumulative ACK after retransmissions arrives or a new hole becomes visible (which are not very exceptional events after all :-)). In the cumulative ACK case especially, this processing is very likely _fully_ wasted walking. So there is still room for large improvements. I've made an improved version of the current sacktag walk a couple of days ago (it's in a state where it didn't crash but likely very buggy still), I'll post it here soon... Idea is to embed recv_sack_cache checking fully into the walking loop. By doing that, previously known work is not duplicated. 
The patch is currently against non-rbtree stuff but incorporating rb-tree things on top of it should be very trivial and synergy benefits with rbtree should be considerable because non-rbtree has to do fast walk skipping for skbs that are under highest_sack which is prone to cache misses. The other issue is that the cached fack_counts seem to be wrong, because they're set when we insert into the queue, but tcp_set_tso_segs() is called later, just before we send, so all the fack_counts are zero. My solution was to set the fack_count when we advance the send_head. I think it's better solution anyway, since we might have to do reset_fack_counts() in between and there's no need to update past sk_send_head. Also I changed tcp_reset_fack_counts() so that it exits when it hits an skb whose tcp_skb_pcount() is zero Do you mind to explain what's the purpose of that? or whose fack_count is already correct. (This really helps when TSO is on, since there's lots of inserting into the middle of the queue.) Good point. Please let me know how I can help get this tested and debugged. Most network development happens against latest net-2.6(.x) trees. In addition, there's experimental tcp-2.6 tree but it's currently a bit outdated already (and DaveM is very busy with the phenomenal merge they're doing for 2.6.24 :-) so it's not too likely to be updated very soon). ...Anyway, thanks for your interest in these things. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6.24 0/9]: TCP improvements cleanups
Hi Dave, Just in case you're short on what to do ;-) here are some TCP related cleanups & improvements to net-2.6.24. Including an FRTO undo fix which finally should allow FRTO to be turned on, and some simple fastpath tweaks simple enough for the 2.6.24 schedule. ...I've a larger fastpath_hint removal patch coming up later too but it's really a monster which needs more time, though I guess it could really cut down the SACK processing latencies people are experiencing with high-speed flows (I'll probably post it with RFC once you've picked these up). These were boot (& couple of hours) tested on top of net-2.6.24 (something after the first large rebase you did, so you could count that as a success report of it too :-)). Not sure if all those fragment/collapse paths I modified got executed though. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/9] [TCP]: Maintain highest_sack accurately to the highest skb
In general, it should not be necessary to call tcp_fragment for already SACKed skbs, but it's better to be safe than sorry. And indeed, it can be called from sacktag when a DSACK arrives or some ACK (with SACK) reordering occurs (sacktag could be made to avoid the call in the latter case though I'm not sure if it's worth of the trouble and added complexity to cover such marginal case). The collapse case has return for SACKED_ACKED case earlier, so just WARN_ON if internal inconsistency is detected for some reason. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |7 +++ 1 files changed, 7 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index d65d17b..9df5b2a 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -692,6 +692,9 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss TCP_SKB_CB(buff)-end_seq = TCP_SKB_CB(skb)-end_seq; TCP_SKB_CB(skb)-end_seq = TCP_SKB_CB(buff)-seq; + if (tp-sacked_out (TCP_SKB_CB(skb)-seq == tp-highest_sack)) + tp-highest_sack = TCP_SKB_CB(buff)-seq; + /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)-flags; TCP_SKB_CB(skb)-flags = flags ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); @@ -1723,6 +1726,10 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m /* Update sequence range on original skb. */ TCP_SKB_CB(skb)-end_seq = TCP_SKB_CB(next_skb)-end_seq; + if (WARN_ON(tp-sacked_out + (TCP_SKB_CB(next_skb)-seq == tp-highest_sack))) + return; + /* Merge over control information. */ flags |= TCP_SKB_CB(next_skb)-flags; /* This moves PSH/FIN etc. over */ TCP_SKB_CB(skb)-flags = flags; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] [TCP]: Make fackets_out accurate
Substraction for fackets_out is unconditional when snd_una advances, thus there's no need to do it inside the loop. Just make sure correct bounds are honored. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 10 +++--- net/ipv4/tcp_output.c | 44 ++-- 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fd0ae4d..09b6b1d 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2302,8 +2302,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp-packets_out) tp-sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp-sacked_out == 0) + + if (WARN_ON(!tp-sacked_out tp-fackets_out)) tp-fackets_out = 0; /* Now state machine starts. @@ -2571,10 +2571,6 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, } else if (*seq_rtt 0) *seq_rtt = now - scb-when; - if (tp-fackets_out) { - __u32 dval = min(tp-fackets_out, packets_acked); - tp-fackets_out -= dval; - } tp-packets_out -= packets_acked; BUG_ON(tcp_skb_pcount(skb) == 0); @@ -2657,7 +2653,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) seq_rtt = now - scb-when; last_ackt = skb-tstamp; } - tcp_dec_pcount_approx(tp-fackets_out, skb); tp-packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); @@ -2672,6 +2667,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_rearm_rto(sk); + tp-fackets_out -= min(pkts_acked, tp-fackets_out); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9df5b2a..cbe8bf6 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -652,6 +652,26 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned } } +/* When a modification to fackets out becomes 
necessary, we need to check + * skb is counted to fackets_out or not. Another important thing is to + * tweak SACK fastpath hint too as it would overwrite all changes unless + * hint is also changed. + */ +static void tcp_adjust_fackets_out(struct tcp_sock *tp, struct sk_buff *skb, + int decr) +{ + if (!tp-sacked_out) + return; + + if (!before(tp-highest_sack, TCP_SKB_CB(skb)-seq)) + tp-fackets_out -= decr; + + /* cnt_hint is off-by-one compared with fackets_out (see sacktag) */ + if (tp-fastpath_skb_hint != NULL + after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, TCP_SKB_CB(skb)-seq)) + tp-fastpath_cnt_hint -= decr; +} + /* Function to create two new TCP segments. Shrinks the given segment * to the specified size and appends a new segment with the rest of the * packet to the list. This won't be called frequently, I hope. @@ -746,21 +766,12 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (TCP_SKB_CB(skb)-sacked TCPCB_LOST) tp-lost_out -= diff; - if (diff 0) { - /* Adjust Reno SACK estimate. */ - if (tcp_is_reno(tp)) { - tcp_dec_pcount_approx_int(tp-sacked_out, diff); - tcp_verify_left_out(tp); - } - - tcp_dec_pcount_approx_int(tp-fackets_out, diff); - /* SACK fastpath might overwrite it unless dealt with */ - if (tp-fastpath_skb_hint != NULL - after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, - TCP_SKB_CB(skb)-seq)) { - tcp_dec_pcount_approx_int(tp-fastpath_cnt_hint, diff); - } + /* Adjust Reno SACK estimate. */ + if (tcp_is_reno(tp) diff 0) { + tcp_dec_pcount_approx_int(tp-sacked_out, diff); + tcp_verify_left_out(tp); } + tcp_adjust_fackets_out(tp, skb, diff); } /* Link BUFF into the send queue. */ @@ -1746,10 +1757,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m if (tcp_is_reno(tp) tp-sacked_out) tcp_dec_pcount_approx(tp-sacked_out, next_skb
[PATCH 3/9] [TCP]: clear_all_retrans_hints prefixed by tcp_
In addition, fix its function comment spacing. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |4 ++-- net/ipv4/tcp_input.c | 10 +- net/ipv4/tcp_output.c |6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index f28f382..16dfe3c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1066,8 +1066,8 @@ static inline void tcp_mib_init(void) TCP_ADD_STATS_USER(TCP_MIB_MAXCONN, -1); } -/*from STCP */ -static inline void clear_all_retrans_hints(struct tcp_sock *tp){ +/* from STCP */ +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 09b6b1d..89162a9 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1670,7 +1670,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-high_seq = tp-frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1741,7 +1741,7 @@ void tcp_enter_loss(struct sock *sk, int how) /* Abort FRTO algorithm if one is in progress */ tp-frto_counter = 0; - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) @@ -2106,7 +2106,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -2199,7 +2199,7 @@ static int tcp_try_undo_loss(struct sock *sk) TCP_SKB_CB(skb)-sacked = ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); DBGUNDO(sk, partial loss); tp-lost_out = 0; @@ -2656,7 +2656,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tp-packets_out -= 
tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } if (ackedFLAG_ACKED) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index cbe8bf6..f46d24b 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len skb-len); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); nsize = skb_headlen(skb) - len; if (nsize 0) nsize = 0; @@ -1719,7 +1719,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_skb_pcount(next_skb) != 1); /* changing transmit queue under us so clear hints */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1792,7 +1792,7 @@ void tcp_simple_retransmit(struct sock *sk) } } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); if (!lost) return; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/9] [TCP]: Move accounting from tso_acked to clean_rtx_queue
The accounting code is pretty much the same, so it's a shame we do it in two places. I'm not too sure if added fully_acked check in MTU probing is really what we want perhaps the added end_seq could be used in the after() comparison. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 75 + 1 files changed, 32 insertions(+), 43 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 89162a9..d340fd5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2528,14 +2528,12 @@ static void tcp_rearm_rto(struct sock *sk) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, -__u32 now, __s32 *seq_rtt) +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *scb = TCP_SKB_CB(skb); __u32 seq = tp-snd_una; __u32 packets_acked; - int acked = 0; /* If we get here, the whole TSO packet has not been * acked. @@ -2548,36 +2546,11 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb-sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked TCPCB_RETRANS) { - if (sacked TCPCB_SACKED_RETRANS) - tp-retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt 0) - *seq_rtt = now - scb-when; - if (sacked TCPCB_SACKED_ACKED) - tp-sacked_out -= packets_acked; - if (sacked TCPCB_LOST) - tp-lost_out -= packets_acked; - if (sacked TCPCB_URG) { - if (tp-urg_mode - !before(seq, tp-snd_up)) - tp-urg_mode = 0; - } - } else if (*seq_rtt 0) - *seq_rtt = now - scb-when; - - tp-packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); BUG_ON(!before(scb-seq, scb-end_seq)); } - return acked; + return packets_acked; } /* Remove acknowledged frames from the retransmission queue. 
*/ @@ -2587,6 +2560,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; __u32 now = tcp_time_stamp; + int fully_acked = 1; int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; @@ -2595,6 +2569,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); + u32 end_seq; + u32 packets_acked; __u8 sacked = scb-sacked; /* If our packet is before the ack sequence we can @@ -2602,11 +2578,19 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * the other end. */ if (after(scb-end_seq, tp-snd_una)) { - if (tcp_skb_pcount(skb) 1 - after(tp-snd_una, scb-seq)) - acked |= tcp_tso_acked(sk, skb, - now, seq_rtt); - break; + if (tcp_skb_pcount(skb) == 1 || + !after(tp-snd_una, scb-seq)) + break; + + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp-snd_una; + } else { + packets_acked = tcp_skb_pcount(skb); + end_seq = scb-end_seq; } /* Initial outgoing SYN's get put onto the write_queue @@ -2624,7 +2608,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } /* MTU probing checks */ - if (icsk-icsk_mtup.probe_size) { + if (fully_acked icsk-icsk_mtup.probe_size) { if (!after(tp-mtu_probe.probe_seq_end, TCP_SKB_CB(skb)-end_seq)) { tcp_mtup_probe_success(sk, skb); } @@ -2633,27 +2617,32 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32
[PATCH 6/9] [TCP] FRTO: Improve interoperability with other undo_marker users
Basically this change enables it, previously other undo_marker users were left with nothing. Reverse undo_marker logic completely to get it set right in CA_Loss. On the other hand, when spurious RTO is detected, clear it. Clearing might be too heavy for some scenarios but seems safe enough starting point for now and shouldn't have much effect except in majority of cases (if in any). By adding a new FLAG_ we avoid looping through write_queue when RTO occurs. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 42 +++--- 1 files changed, 27 insertions(+), 15 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 74accb0..948e79a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -1597,6 +1598,8 @@ void tcp_enter_frto(struct sock *sk) tp-undo_retrans = 0; skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)-sacked TCPCB_RETRANS) + tp-undo_marker = 0; if (TCP_SKB_CB(skb)-sacked TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)-sacked = ~TCPCB_SACKED_RETRANS; tp-retrans_out -= tcp_skb_pcount(skb); @@ -1646,6 +1649,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)-sacked TCPCB_RETRANS) + tp-undo_marker = 0; TCP_SKB_CB(skb)-sacked = ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } @@ -1661,7 +1666,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-snd_cwnd = 
tcp_packets_in_flight(tp) + allowed_segments; tp-snd_cwnd_cnt = 0; tp-snd_cwnd_stamp = tcp_time_stamp; - tp-undo_marker = 0; tp-frto_counter = 0; tp-reordering = min_t(unsigned int, tp-reordering, @@ -2587,20 +2591,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) end_seq = scb-end_seq; } - /* Initial outgoing SYN's get put onto the write_queue -* just like anything else we transmit. It is not -* true data, and if we misinform our callers that -* this ACK acks real data, we will erroneously exit -* connection startup slow start one packet too -* quickly. This is severely frowned upon behavior. -*/ - if (!(scb-flags TCPCB_FLAG_SYN)) { - flag |= FLAG_DATA_ACKED; - } else { - flag |= FLAG_SYN_ACKED; - tp-retrans_stamp = 0; - } - /* MTU probing checks */ if (fully_acked icsk-icsk_mtup.probe_size !after(tp-mtu_probe.probe_seq_end, scb-end_seq)) { @@ -2613,6 +2603,9 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) tp-retrans_out -= packets_acked; flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag FLAG_DATA_ACKED) || + (packets_acked 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt 0) { seq_rtt = now - scb-when; if (fully_acked) @@ -2634,6 +2627,20 @@ static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) } tp-packets_out -= packets_acked; + /* Initial outgoing SYN's get put onto the write_queue +* just like anything else we transmit. It is not +* true data, and if we misinform our callers that +* this ACK acks real data, we will erroneously exit +* connection startup slow start one packet too +* quickly. This is severely frowned upon behavior. +*/ + if (!(scb-flags TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp-retrans_stamp = 0
[PATCH 7/9] [TCP] FRTO: Update sysctl documentation
Since the SACK enhanced FRTO was added, the code has been under test numerous times so remove experimental claim from the documentation. Also be a bit more verbose about the usage. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Documentation/networking/ip-sysctl.txt | 17 - 1 files changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index 32c2e9d..6ae2fef 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -180,13 +180,20 @@ tcp_fin_timeout - INTEGER to live longer. Cf. tcp_max_orphans. tcp_frto - INTEGER - Enables F-RTO, an enhanced recovery algorithm for TCP retransmission + Enables Forward RTO-Recovery (F-RTO) defined in RFC4138. + F-RTO is an enhanced recovery algorithm for TCP retransmission timeouts. It is particularly beneficial in wireless environments where packet loss is typically due to random radio interference - rather than intermediate router congestion. If set to 1, basic - version is enabled. 2 enables SACK enhanced F-RTO, which is - EXPERIMENTAL. The basic version can be used also when SACK is - enabled for a flow through tcp_sack sysctl. + rather than intermediate router congestion. FRTO is sender-side + only modification. Therefore it does not require any support from + the peer, but in a typical case, however, where wireless link is + the local access link and most of the data flows downlink, the + faraway servers should have FRTO enabled to take advantage of it. + If set to 1, basic version is enabled. 2 enables SACK enhanced + F-RTO if flow uses SACK. The basic version can be used also when + SACK is in use though scenario(s) with it exists where FRTO + interacts badly with the packet counting of the SACK enabled TCP + flow. 
tcp_frto_response - INTEGER When F-RTO has detected that a TCP retransmission timeout was -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/9] [TCP]: Cleanup tcp_tso_acked and tcp_clean_rtx_queue
Implements following cleanups: - Comment re-placement (CodingStyle) - tcp_tso_acked() local (wrapper-like) variable removal (readability) - __-types removed (IMHO they make local variables jumpy looking and just was space) - acked - flag (naming conventions elsewhere in TCP code) - linebreak adjustments (readability) - nested if()s combined (reduced indentation) - clarifying newlines added Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 66 ++--- 1 files changed, 30 insertions(+), 36 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d340fd5..74accb0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2528,55 +2528,49 @@ static void tcp_rearm_rto(struct sock *sk) } } +/* If we get here, the whole TSO packet has not been acked. */ static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp-snd_una; - __u32 packets_acked; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been -* acked. -*/ - BUG_ON(!after(scb-end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)-end_seq, tp-snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb-seq)) + if (tcp_trim_head(sk, skb, tp-snd_una - TCP_SKB_CB(skb)-seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb-seq, scb-end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)-seq, TCP_SKB_CB(skb)-end_seq)); } return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. 
+ */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; + u32 now = tcp_time_stamp; int fully_acked = 1; - int acked = 0; + int flag = 0; int prior_packets = tp-packets_out; - __s32 seq_rtt = -1; + s32 seq_rtt = -1; ktime_t last_ackt = net_invalid_timestamp(); - while ((skb = tcp_write_queue_head(sk)) - skb != tcp_send_head(sk)) { + while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); u32 end_seq; u32 packets_acked; - __u8 sacked = scb-sacked; + u8 sacked = scb-sacked; - /* If our packet is before the ack sequence we can -* discard it as it's confirmed to have arrived at -* the other end. -*/ if (after(scb-end_seq, tp-snd_una)) { if (tcp_skb_pcount(skb) == 1 || !after(tp-snd_una, scb-seq)) @@ -2601,38 +2595,38 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) * quickly. This is severely frowned upon behavior. */ if (!(scb-flags TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; + flag |= FLAG_DATA_ACKED; } else { - acked |= FLAG_SYN_ACKED; + flag |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; } /* MTU probing checks */ - if (fully_acked icsk-icsk_mtup.probe_size) { - if (!after(tp-mtu_probe.probe_seq_end, TCP_SKB_CB(skb)-end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked icsk-icsk_mtup.probe_size + !after(tp-mtu_probe.probe_seq_end, scb-end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked TCPCB_RETRANS) { if (sacked TCPCB_SACKED_RETRANS) tp-retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; } else if (seq_rtt 0) { seq_rtt = now - scb-when; if (fully_acked) last_ackt = skb-tstamp
[PATCH 8/9] [TCP]: Enable SACK enhanced FRTO (RFC4138) by default
Most of the description that follows comes from my mail to netdev (some editing done): Main obstacle to FRTO use is its deployment as it has to be on the sender side where as wireless link is often the receiver's access link. Take initiative on behalf of unlucky receivers and enable it by default in future Linux TCP senders. Also IETF seems to be interested in advancing FRTO from experimental [1]. How does FRTO help? === FRTO detects spurious RTOs and avoids a number of unnecessary retransmissions and a couple of other problems that can arise due to incorrect guess made at RTO (i.e., that segments were lost when they actually got delayed which is likely to occur e.g. in wireless environments with link-layer retransmission). Though FRTO cannot prevent the first (potentially unnecessary) retransmission at RTO, I suspect that it won't cost that much even if you have to pay for each bit (won't be that high percentage out of all packets after all :-)). However, usually when you have a spurious RTO, not only the first segment unnecessarily retransmitted but the *whole window*. It goes like this: all cumulative ACKs got delayed due to in-order delivery, then TCP will actually send 1.5*original cwnd worth of data in the RTO's slow-start when the delayed ACKs arrive (basically the original cwnd worth of it unnecessarily). In case one is interested in minimizing unnecessary retransmissions e.g. due to cost, those rexmissions must never see daylight. Besides, in the worst case the generated burst overloads the bottleneck buffers which is likely to significantly delay the further progress of the flow. In case of ll rexmissions, ACK compression often occurs at the same time making the burst very sharp edged (in that case TCP often loses most of the segments above high_seq = very bad performance too). 
When FRTO is enabled, those unnecessary retransmissions are fully avoided except for the first segment and the cwnd behavior after detected spurious RTO is determined by the response (one can tune that by sysctl). Basic version (non-SACK enhanced one), FRTO can fail to detect spurious RTO as spurious and falls back to conservative behavior. ACK lossage is much less significant than reordering, usually the FRTO can detect spurious RTO if at least 2 cumulative ACKs from original window are preserved (excluding the ACK that advances to high_seq). With SACK-enhanced version, the detection is quite robust. FRTO should remove the need to set a high lower bound for the RTO estimator due to delay spikes that occur relatively common in some environments (esp. in wireless/cellular ones). [1] http://www1.ietf.org/mail-archive/web/tcpm/current/msg02862.html Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 948e79a..02b549b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 9/9] [TCP]: Avoid clearing sacktag hint in trivial situations
There's no reason to clear the sacktag skb hint when small part of the rexmit queue changes. Account changes (if any) instead when fragmenting/collapsing. RTO/FRTO do not touch SACKED_ACKED bits so no need to discard SACK tag hint at all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |6 +- net/ipv4/tcp_input.c | 14 -- net/ipv4/tcp_output.c | 12 3 files changed, 21 insertions(+), 11 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 16dfe3c..07b1faa 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1067,11 +1067,15 @@ static inline void tcp_mib_init(void) } /* from STCP */ -static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { +static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp-lost_skb_hint = NULL; tp-scoreboard_skb_hint = NULL; tp-retransmit_skb_hint = NULL; tp-forward_skb_hint = NULL; +} + +static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { + tcp_clear_retrans_hints_partial(tp); tp-fastpath_skb_hint = NULL; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 02b549b..1092b5a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1674,7 +1674,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp-high_seq = tp-frto_highmark; TCP_ECN_queue_cwr(tp); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } void tcp_clear_retrans(struct tcp_sock *tp) @@ -1714,10 +1714,14 @@ void tcp_enter_loss(struct sock *sk, int how) tp-bytes_acked = 0; tcp_clear_retrans(tp); - /* Push undo marker, if it was plain RTO and nothing -* was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing +* was retransmitted. 
*/ tp-undo_marker = tp-snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tcp_clear_all_retrans_hints(tp); + } tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) @@ -1744,8 +1748,6 @@ void tcp_enter_loss(struct sock *sk, int how) TCP_ECN_queue_cwr(tp); /* Abort FRTO algorithm if one is in progress */ tp-frto_counter = 0; - - tcp_clear_all_retrans_hints(tp); } static int tcp_check_sack_reneging(struct sock *sk) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index f46d24b..cbb83ac 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -687,7 +687,7 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss BUG_ON(len skb-len); - tcp_clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); nsize = skb_headlen(skb) - len; if (nsize 0) nsize = 0; @@ -1718,9 +1718,6 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - /* changing transmit queue under us so clear hints */ - tcp_clear_all_retrans_hints(tp); - /* Ok. We will be able to collapse the packet. */ tcp_unlink_write_queue(next_skb, sk); @@ -1759,6 +1756,13 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m tcp_adjust_fackets_out(tp, skb, tcp_skb_pcount(next_skb)); tp-packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + /* manually tune sacktag skb hint */ + if (tp-fastpath_skb_hint == next_skb) + tp-fastpath_skb_hint = skb; + sk_stream_free_skb(sk, next_skb); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH - net-2.6.24 0/2] Introduce and use print_ip and print_ipv6
On Thu, 20 Sep 2007, Joe Perches wrote: On Thu, 2007-09-20 at 07:55 -0700, Randy Dunlap wrote: How large are the patches if you posted them for review instead of just referencing gits for them? (which cuts down on review possibilities) The v4 is ~130kb, the v6 ~35kb. There is a gitweb available at: print_ip: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=shortlog;h=print_ipv4 commit diff: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=commitdiff;h=1e3a30d5d8b49b3accca07cc84ecf6d977cacdd5 print_ipv6: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=shortlog;h=print_ipv6 commit diff: http://repo.or.cz/w/linux-2.6/trivial-mods.git?a=commitdiff;h=e96b794a57a164db84379e2baf5fe2622a5ae3bf ...Alternatively you could split it up a bit and send those smaller chunks for reviewing purposes only (even though it would be combined to a single big patch in the end). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 9/9] [TCP]: Avoid clearing sacktag hint in trivial situations
On Thu, 20 Sep 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 20 Sep 2007 15:17:52 +0300 There's no reason to clear the sacktag skb hint when small part of the rexmit queue changes. Account changes (if any) instead when fragmenting/collapsing. RTO/FRTO do not touch SACKED_ACKED bits so no need to discard SACK tag hint at all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Applied, and I followed it up with this coding style fixlet. Yeah, thanks for fixing it... ...Just didn't notice it was left wrong while doing things that required more thinking to get them right... -- i.
[PATCH netdev-2.6/ALL] DIV_ROUND_UP/roundup cleanup (under drivers/net/)
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- drivers/net/arcnet/rfc1201.c|3 ++- drivers/net/atl1/atl1_main.c| 17 +++-- drivers/net/gianfar_ethtool.c |2 +- drivers/net/ibm_emac/ibm_emac_mal.h |3 ++- drivers/net/mlx4/alloc.c|3 ++- drivers/net/mlx4/icm.c |3 ++- drivers/net/s2io.c | 11 --- drivers/net/sb1250-mac.c|2 +- drivers/net/starfire.c |6 +++--- drivers/net/wan/sbni.c |2 +- drivers/net/xen-netfront.c |2 +- 11 files changed, 26 insertions(+), 28 deletions(-) diff --git a/drivers/net/arcnet/rfc1201.c b/drivers/net/arcnet/rfc1201.c index 460a095..126ee5b 100644 --- a/drivers/net/arcnet/rfc1201.c +++ b/drivers/net/arcnet/rfc1201.c @@ -23,6 +23,7 @@ * * ** */ +#include linux/kernel.h #include linux/module.h #include linux/init.h #include linux/if_arp.h @@ -493,7 +494,7 @@ static int prepare_tx(struct net_device *dev, struct archdr *pkt, int length, out-length = length - RFC1201_HDR_SIZE; out-dataleft = lp-outgoing.length; - out-numsegs = (out-dataleft + maxsegsize - 1) / maxsegsize; + out-numsegs = DIV_ROUND_UP(out-dataleft, maxsegsize); out-segnum = 0; BUGMSG(D_DURING, rfc1201 prep_tx: ready for %d-segment split diff --git a/drivers/net/atl1/atl1_main.c b/drivers/net/atl1/atl1_main.c index c8deefa..b198f70 100644 --- a/drivers/net/atl1/atl1_main.c +++ b/drivers/net/atl1/atl1_main.c @@ -51,6 +51,7 @@ */ #include linux/types.h +#include linux/kernel.h #include linux/netdevice.h #include linux/pci.h #include linux/spinlock.h @@ -1132,8 +1133,7 @@ static void atl1_update_rfd_index(struct atl1_adapter *adapter, { u16 num_buf; - num_buf = (rrd-xsz.xsum_sz.pkt_size + adapter-rx_buffer_len - 1) / - adapter-rx_buffer_len; + num_buf = DIV_ROUND_UP(rrd-xsz.xsum_sz.pkt_size, adapter-rx_buffer_len); if (rrd-num_buf == num_buf) /* clean alloc flag for bad rrd */ atl1_clean_alloc_flag(adapter, rrd, num_buf); @@ -1537,8 +1537,7 @@ static void atl1_tx_map(struct atl1_adapter *adapter, struct sk_buff *skb, if (first_buf_len proto_hdr_len) { len12 = first_buf_len - proto_hdr_len; 
- m = (len12 + ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + m = DIV_ROUND_UP(len12, ATL1_MAX_TX_BUF_LEN); for (i = 0; i m; i++) { buffer_info = tpd_ring-buffer_info[tpd_next_to_use]; @@ -1578,7 +1577,7 @@ static void atl1_tx_map(struct atl1_adapter *adapter, struct sk_buff *skb, frag = skb_shinfo(skb)-frags[f]; lenf = frag-size; - m = (lenf + ATL1_MAX_TX_BUF_LEN - 1) / ATL1_MAX_TX_BUF_LEN; + m = DIV_ROUND_UP(lenf, ATL1_MAX_TX_BUF_LEN); for (i = 0; i m; i++) { buffer_info = tpd_ring-buffer_info[tpd_next_to_use]; if (unlikely(buffer_info-skb)) @@ -1681,8 +1680,7 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) for (f = 0; f nr_frags; f++) { frag_size = skb_shinfo(skb)-frags[f].size; if (frag_size) - count += (frag_size + ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + count += DIV_ROUND_UP(frag_size, ATL1_MAX_TX_BUF_LEN); } /* mss will be nonzero if we're doing segment offload (TSO/GSO) */ @@ -1697,9 +1695,8 @@ static int atl1_xmit_frame(struct sk_buff *skb, struct net_device *netdev) } /* need additional TPD ? */ if (proto_hdr_len != len) - count += (len - proto_hdr_len + - ATL1_MAX_TX_BUF_LEN - 1) / - ATL1_MAX_TX_BUF_LEN; + count += DIV_ROUND_UP(len - proto_hdr_len, + ATL1_MAX_TX_BUF_LEN); } } diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c index 2470903..8352754 100644 --- a/drivers/net/gianfar_ethtool.c +++ b/drivers/net/gianfar_ethtool.c @@ -245,7 +245,7 @@ static unsigned int gfar_usecs2ticks(struct gfar_private *priv, unsigned int use /* Make sure we return a number greater than 0 * if usecs 0 */ - return ((usecs * 1000 + count - 1) / count); + return DIV_ROUND_UP(usecs * 1000, count); } /* Convert ethernet clock ticks to microseconds */ diff --git a/drivers/net/ibm_emac/ibm_emac_mal.h b/drivers/net
Re: [PATCH] make _minimum_ TCP retransmission timeout configurable
On Wed, 29 Aug 2007, David Miller wrote: From: Rick Jones [EMAIL PROTECTED] Date: Wed, 29 Aug 2007 16:06:27 -0700 I believe the biggest component comes from link-layer retransmissions. There can also be some short outages thanks to signal blocking, tunnels, people with big hats and whatnot that the link-layer retransmissions are trying to address. The three seconds seems to be a value that gives the certainty that 99 times out of 100 the segment was indeed lost. The trace I've been sent shows clean RTTs ranging from ~200 milliseconds to ~7000 milliseconds. Thanks for the info. It's pretty easy to generate examples where we might have some sockets talking over interfaces on such a network and others which are not. Therefore, if we do this, a per-route metric is probably the best bet. Ilpo, I'm also very interested to see what you think of all of this :-) ...Haven't been too actively reading mails for a while until now, so I'm a bit late in response... I'll try to quickly summarize FRTO here. It's true that FRTO cannot prevent the first retransmission, yet I suspect that it won't cost that much even if you have to pay for each bit, won't be that high percentage out of all packets after all :-). However, usually when you have a spurious RTO, not only is the first segment unnecessarily retransmitted but the *whole window*. It goes like this: all cumulative ACKs got delayed due to in-order delivery, then TCP will actually send 1.5*original cwnd worth of data in the RTO's slow-start when the delayed ACKs arrive (basically the original cwnd worth of it unnecessarily). In case one is interested in minimizing unnecessary retransmissions e.g. due to cost, those rexmissions must never see daylight. Besides, in the worst case the generated burst overloads the bottleneck buffers which is likely to significantly delay the further progress of the flow. 
In case of ll rexmissions, ACK compression often occurs at the same time making the burst very sharp edged (in that case TCP often loses most of the segments above high_seq = very bad performance too). When FRTO is enabled, those unnecessary retransmissions are fully avoided except for the first segment and the cwnd behavior after detected spurious RTO is determined by the response (one can tune that by sysctl). Basic version (non-SACK enhanced one), FRTO can fail to detect spurious RTO as spurious and falls back to conservative behavior. ACK lossage is much less significant than reordering, usually the FRTO can detect spurious RTO if at least 2 cumulative ACKs from original window are preserved (excluding the ACK that advances to high_seq). With SACK-enhanced version, the detection is quite robust. Of course one could jump to min_rto bandwagon instead, but it often ends up being more or less black magic and can still produce unwanted behavior unless one goes to ridiculously high minimum RTOs. Main obstacle to FRTO use is its deployment as it has to be on the sender side whereas wireless link is often the receiver's access link but if one can tune tcp_min_rto (or equal) on the sender side, one could enable FRTO at will as well. Anyway, anything older than 2.6.22 is not going to give very good results with FRTO. FRTO code's maturity point of view, IMHO currently just unconditional clearing of undo_marker (in tcp_enter_frto_loss) is on the way of enabling FRTO in future kernels by default as it basically disables DSACK undoing, I'll try to solve that soon, has been on my todo list for too long already (don't currently have much time to devote to that though so 2.6.24-rc1 might come too early for me :-(). After that, it might be a good move to enable it in mainline by default if you agree... ...Uninteresting enough, even IETF seems to be interested in advancing FRTO from experimental [1]. 
Another important thing to consider in cellular besides ll rexmissions is bandwidth allocation delay... We actually a week ago ran some measurements in a real umts network to determine buffer, one-way delay, etc. behavior (though YMMV depending on operator's configuration etc.). Basically we saw 1 s delay spike when allocation delay occurs (it's very hard to predict when that happens due to other network users' role). One-way propagation delay was around 50 ms, so 1500 bytes takes about 80 ms+ to transmit, so it's an order of magnitude larger than RTT but queue delay is probably large enough to prevent spurious RTOs due to allocation delay. Besides that, we saw some long latencies, up to 8-12 s, they could be due to ll retransmissions but their source is not yet verified to be the WWAN link as we had the phone connected through bluetooth (could interfere). A funny sidenote about the experiment, we found out what Linux cannot do (from userspace only): it seems to be unable to receive the same packet it has sent out to itself as we forced the packet out from eth0 by binding sending dev to eth0 and
[PATCH net-2.6.24] [NET]: DIV_ROUND_UP cleanup (part two)
Hopefully captured all single statement cases under net/. I'm not too sure if there is some policy about #includes that are guaranteed (ie., in the current tree) to be available through some other #included header, so I just added linux/kernel.h to each changed file that didn't #include it previously. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/sock.h |3 ++- net/bridge/br_stp_bpdu.c |2 +- net/dccp/ackvec.c|5 ++--- net/ieee80211/ieee80211_crypt_ccmp.c |5 +++-- net/ipv4/inet_diag.c |5 +++-- net/ipv4/inet_timewait_sock.c|4 ++-- net/ipv4/tcp.c |3 ++- net/mac80211/aes_ccm.c |5 +++-- net/mac80211/tx.c|2 +- net/sunrpc/svcsock.c |3 ++- 10 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index dfeb8b1..802c670 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -40,6 +40,7 @@ #ifndef _SOCK_H #define _SOCK_H +#include linux/kernel.h #include linux/list.h #include linux/timer.h #include linux/cache.h @@ -702,7 +703,7 @@ extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind); static inline int sk_stream_pages(int amt) { - return (amt + SK_STREAM_MEM_QUANTUM - 1) / SK_STREAM_MEM_QUANTUM; + return DIV_ROUND_UP(amt, SK_STREAM_MEM_QUANTUM); } static inline void sk_stream_mem_reclaim(struct sock *sk) diff --git a/net/bridge/br_stp_bpdu.c b/net/bridge/br_stp_bpdu.c index 60112bc..14f0c88 100644 --- a/net/bridge/br_stp_bpdu.c +++ b/net/bridge/br_stp_bpdu.c @@ -64,7 +64,7 @@ static inline int br_get_ticks(const unsigned char *src) { unsigned long ticks = ntohs(get_unaligned((__be16 *)src)); - return (ticks * HZ + STP_HZ - 1) / STP_HZ; + return DIV_ROUND_UP(ticks * HZ, STP_HZ); } /* called under bridge lock */ diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index 3f8984b..83378f3 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -69,9 +69,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_sock *dp = dccp_sk(sk); struct dccp_ackvec *av = 
dp-dccps_hc_rx_ackvec; /* Figure out how many options do we need to represent the ackvec */ - const u16 nr_opts = (av-dccpav_vec_len + -DCCP_MAX_ACKVEC_OPT_LEN - 1) / - DCCP_MAX_ACKVEC_OPT_LEN; + const u16 nr_opts = DIV_ROUND_UP(av-dccpav_vec_len, +DCCP_MAX_ACKVEC_OPT_LEN); u16 len = av-dccpav_vec_len + 2 * nr_opts, i; u32 elapsed_time; const unsigned char *tail, *from; diff --git a/net/ieee80211/ieee80211_crypt_ccmp.c b/net/ieee80211/ieee80211_crypt_ccmp.c index b016b41..2e6b099 100644 --- a/net/ieee80211/ieee80211_crypt_ccmp.c +++ b/net/ieee80211/ieee80211_crypt_ccmp.c @@ -9,6 +9,7 @@ * more details. */ +#include linux/kernel.h #include linux/err.h #include linux/module.h #include linux/init.h @@ -241,7 +242,7 @@ static int ieee80211_ccmp_encrypt(struct sk_buff *skb, int hdr_len, void *priv) hdr = (struct ieee80211_hdr_4addr *)skb-data; ccmp_init_blocks(key-tfm, hdr, key-tx_pn, data_len, b0, b, s0); - blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN; + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); last = data_len % AES_BLOCK_LEN; for (i = 1; i = blocks; i++) { @@ -351,7 +352,7 @@ static int ieee80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv) ccmp_init_blocks(key-tfm, hdr, pn, data_len, b0, a, b); xor_block(mic, b, CCMP_MIC_LEN); - blocks = (data_len + AES_BLOCK_LEN - 1) / AES_BLOCK_LEN; + blocks = DIV_ROUND_UP(data_len, AES_BLOCK_LEN); last = data_len % AES_BLOCK_LEN; for (i = 1; i = blocks; i++) { diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index dbeacd8..06922da 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -11,6 +11,7 @@ * 2 of the License, or (at your option) any later version. 
*/ +#include linux/kernel.h #include linux/module.h #include linux/types.h #include linux/fcntl.h @@ -112,7 +113,7 @@ static int inet_csk_diag_fill(struct sock *sk, } #endif -#define EXPIRES_IN_MS(tmo) ((tmo - jiffies) * 1000 + HZ - 1) / HZ +#define EXPIRES_IN_MS(tmo) DIV_ROUND_UP((tmo - jiffies) * 1000, HZ) if (icsk-icsk_pending == ICSK_TIME_RETRANS) { r-idiag_timer = 1; @@ -190,7 +191,7 @@ static int inet_twsk_diag_fill(struct inet_timewait_sock *tw, r-id.idiag_dst[0]= tw-tw_daddr; r-idiag_state= tw-tw_substate; r-idiag_timer= 3; - r-idiag_expires = (tmo * 1000 + HZ - 1) / HZ; + r-idiag_expires = DIV_ROUND_UP(tmo * 1000, HZ); r-idiag_rqueue = 0; r-idiag_wqueue = 0
net-2.6.24 build broken (allyesconfig)
Hi, $ git-reset --hard net-2.6.24-origin HEAD is now at e2eb35e... [ATM]: Fix build errors after conversion to pr_debug() $ make allyesconfig ... $ make bzImage ... CC drivers/net/82596.o drivers/net/82596.c:1618:1: error: unterminated argument list invoking macro DEB drivers/net/82596.c: In function 'set_multicast_list': drivers/net/82596.c:1564: error: 'DEB' undeclared (first use in this function) drivers/net/82596.c:1564: error: (Each undeclared identifier is reported only once drivers/net/82596.c:1564: error: for each function it appears in.) drivers/net/82596.c:1564: error: expected ';' at end of input make[1]: *** [drivers/net/82596.o] Error 1 make: *** [drivers/net/82596.o] Error 2 Hmm, I would guess that [NET]: Introduce MAC_FMT/MAC_ARG broke it, though didn't verify it. The fix is left as an exercise for the reader (i.e., the solution wasn't too obvious for me :-) )... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 4/5] [TCP]: Discard fuzzy SACK blocks
On Fri, 24 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 20 Aug 2007 16:16:32 +0300 SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Thanks a lot for coding this up, I like it a lot, applied. I have some minor worries about the D-SACK lower bound, but it's probably OK and I'm just being paranoid :-) ...Please tell what is your concern rather than hinting :-), or is it just a hunch?... Elsewhere we do a similar checking anyway (I might eventually end up dropping this check in dsack as duplicate due to other planned changes but it's necessary still as the validation is being done in the mainloop after check_dsack call): /* D-SACK for already forgotten data... Do dumb counting. */ if (dup_sack !after(end_seq_0, prior_snd_una) after(end_seq_0, tp-undo_marker)) tp-undo_retrans--; ...It's natural that due to HW duplication that we get DSACK below undo_marker when state = CA_CWR. In general, they almost always mean exactly that, a HW duplication, so instead IMHO we should account them as DSACK lying bit in sack_ok and disable DSACK undos for that flow (similar case is !EVER_RETRANS skb gets DSACKed). In theory, it could be delayed rexmission or something but we've already lost our state already and cannot verify that, so choosing the conservative dsack-lying bit there seems fine and should even be right thing for majority of the cases anyway. I was planning to do something along those lines later... 
The key problem here was (in the previous version that was in tcp-2.6), that the whole validation business becomes rather useless, if any invalid SACK blocks (those that really aren't DSACK but due to bug, malicious or whatever reason) below snd_una get accepted as DSACK, that's basically the thing I wanted to avoid as it will be significant for the half of the seqno space... Now there's hopefully a bit smaller window for such garbage... :-) Key exits from tcp_is_sackblock_valid reside before those checks anyway, so they shouldn't be that problematic performance-wise either. ...I was thinking of adding unlikely to the latter checks but wasn't too sure if that's a wise thing to do as malicious entity could push TCP to do them (basically at will), Comments? One additional note: the mainloop operates anyway only in the seqno range above prior_snd_una (earlier skbs already being dropped), that can't ever be undo_marker (so some of the DSACK checks are not yet strictly necessary but I wanted to do things right from the beginning as I might end up re-placing validation soon ;-)). ...So our discussion currently probably covers seqno range that is not going to have any significance at all... 
:-) Maybe my Too old comment deserves some additional explanation: [PATCH] [TCP]: More verbose comment to DSACK validation Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 8692d0b..cd187c6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1070,7 +1070,10 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, if (!before(start_seq, tp-undo_marker)) return 1; - /* Too old */ + /* Too old (it no longer has any significance to TCP state though +* it can be valid; for more complete explanation see comment above +* and similar validation done in tcp_check_dsack()) +*/ if (!after(end_seq, tp-undo_marker)) return 0; -- 1.5.0.6
[PATCH net-2.6.24] [NET] Cleanup: DIV_ROUND_UP
Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |6 +- net/key/af_key.c | 17 + 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 10b2e39..bca4ee2 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -646,11 +646,7 @@ static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned skb_shinfo(skb)-gso_size = 0; skb_shinfo(skb)-gso_type = 0; } else { - unsigned int factor; - - factor = skb-len + (mss_now - 1); - factor /= mss_now; - skb_shinfo(skb)-gso_segs = factor; + skb_shinfo(skb)-gso_segs = DIV_ROUND_UP(skb-len, mss_now); skb_shinfo(skb)-gso_size = mss_now; skb_shinfo(skb)-gso_type = sk-sk_gso_type; } diff --git a/net/key/af_key.c b/net/key/af_key.c index 5502df1..17b2a69 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -352,16 +352,14 @@ static int verify_address_len(void *p) switch (addr-sa_family) { case AF_INET: - len = sizeof(*sp) + sizeof(*sin) + (sizeof(uint64_t) - 1); - len /= sizeof(uint64_t); + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin), sizeof(uint64_t)); if (sp-sadb_address_len != len || sp-sadb_address_prefixlen 32) return -EINVAL; break; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) case AF_INET6: - len = sizeof(*sp) + sizeof(*sin6) + (sizeof(uint64_t) - 1); - len /= sizeof(uint64_t); + len = DIV_ROUND_UP(sizeof(*sp) + sizeof(*sin6), sizeof(uint64_t)); if (sp-sadb_address_len != len || sp-sadb_address_prefixlen 128) return -EINVAL; @@ -386,14 +384,9 @@ static int verify_address_len(void *p) static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) { - int len = 0; - - len += sizeof(struct sadb_x_sec_ctx); - len += sec_ctx-sadb_x_ctx_len; - len += sizeof(uint64_t) - 1; - len /= sizeof(uint64_t); - - return len; + return DIV_ROUND_UP(sizeof(struct sadb_x_sec_ctx) + + sec_ctx-sadb_x_ctx_len, + sizeof(uint64_t)); } static inline int verify_sec_ctx_len(void *p) -- 1.5.0.6
[PATCH net-2.6.24 0/5] TCP: Cleanups SACK block validation
Hi Dave, Here are a couple of patches to net-2.6.24. The first three are trivial cleanups. The idea for the last two comes from tcp-2.6 but the validator has been heavily modified (and hopefully improved in the process :-)). I'm not sure though if checking for the undo_marker boundary crossing case is a bit over-engineering (inherited from the original version which already checked for that case). In addition, better names could be invented for MIBs, suggestions? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 4/5] [TCP]: Discard fuzzy SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been a sort of russian roulette as no validation of SACK blocks is previously attempted. Besides, it is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). So now close the roulette once and for all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 82 ++ 1 files changed, 82 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2bf3d57..102aefa 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1001,7 +1001,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric, *for retransmitted and already SACKed segment - reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * -- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * - outs wnd - - wrapzone - + * u e n u_w e_w s n_w + * | | | | | | | + * |+--+- TCP seqno space --+--| + * ...-- 2^31 -| |... + * ... 2^31 --||... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. 
Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap - snd_una region will then become well defined, i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp-max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp-snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp-snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. 
+* start_seq == snd_una is non-sensical (see comments above) +*/ + if (after(start_seq, tp-snd_una)) + return 1; + + if (!is_dsack || !tp-undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp-snd_una)) + return 0; + + if (!before(start_seq, tp-undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp-undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: +* start_seq undo_marker and end_seq = undo_marker. +*/ + return !before(start_seq, end_seq - tp-max_window); +} + + static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, struct tcp_sack_block_wire *sp, int num_sacks, u32 prior_snd_una) @@ -1143,6 +1222,9 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int fack_count
[PATCH 2/5] [TCP]: tcp_packets_out_inc to tcp_output.c (no callers elsewhere)
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h | 12 net/ipv4/tcp_output.c | 12 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6d586ca..f28f382 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -614,18 +614,6 @@ static inline void tcp_dec_pcount_approx(__u32 *count, tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); } -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - int orig = tp-packets_out; - - tp-packets_out += tcp_skb_pcount(skb); - if (!orig) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)-icsk_rto, TCP_RTO_MAX); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 1d65ce1..a61a3e3 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -61,6 +61,18 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; +static inline void tcp_packets_out_inc(struct sock *sk, + const struct sk_buff *skb) +{ + struct tcp_sock *tp = tcp_sk(sk); + int orig = tp-packets_out; + + tp-packets_out += tcp_skb_pcount(skb); + if (!orig) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)-icsk_rto, TCP_RTO_MAX); +} + static void update_send_head(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 1/5] [TCP]: Remove unnecessary wrapper tcp_packets_out_dec
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Makes caller side more obvious, there's no need to have a wrapper for this oneliner! Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h |6 -- net/ipv4/tcp_input.c |2 +- net/ipv4/tcp_output.c |2 +- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 7c65989..6d586ca 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -626,12 +626,6 @@ static inline void tcp_packets_out_inc(struct sock *sk, inet_csk(sk)-icsk_rto, TCP_RTO_MAX); } -static inline void tcp_packets_out_dec(struct tcp_sock *tp, - const struct sk_buff *skb) -{ - tp-packets_out -= tcp_skb_pcount(skb); -} - /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 96ced89..45ad32c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2548,7 +2548,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) last_ackt = skb-tstamp; } tcp_dec_pcount_approx(tp-fackets_out, skb); - tcp_packets_out_dec(tp, skb); + tp-packets_out -= tcp_skb_pcount(skb); tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); clear_all_retrans_hints(tp); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index a367917..1d65ce1 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1735,7 +1735,7 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m * it is better to underestimate fackets. */ tcp_dec_pcount_approx(tp-fackets_out, next_skb); - tcp_packets_out_dec(tp, next_skb); + tp-packets_out -= tcp_skb_pcount(next_skb); sk_stream_free_skb(sk, next_skb); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 5/5] [TCP] MIB: Add counters for discarded SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] In DSACK case, some events are not extraordinary, such as packet duplication generated DSACK. They can arrive easily below snd_una when undo_marker is not set (TCP being in CA_Open), counting such DSACKs amoung SACK discards will likely just mislead if they occur in some scenario when there are other problems as well. Similarly, excessively delayed packets could cause normal DSACKs. Therefore, separate counters are allocated for DSACK events. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h |3 +++ net/ipv4/proc.c |3 +++ net/ipv4/tcp_input.c | 10 +- 3 files changed, 15 insertions(+), 1 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 802b3a3..d24c554 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,9 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ + LINUX_MIB_TCPDSACKIGNOREDOLD, /* TCPSACKIgnoredOld */ + LINUX_MIB_TCPDSACKIGNOREDNOUNDO,/* TCPSACKIgnoredNoUndo */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf..986d1c8 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,9 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPAbortOnLinger, LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM(TCPAbortFailed, LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM(TCPMemoryPressures, LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM(TCPSACKDiscard, LINUX_MIB_TCPSACKDISCARD), + SNMP_MIB_ITEM(TCPDSACKIgnoredOld, LINUX_MIB_TCPDSACKIGNOREDOLD), + SNMP_MIB_ITEM(TCPDSACKIgnoredNoUndo, LINUX_MIB_TCPDSACKIGNOREDNOUNDO), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 102aefa..8692d0b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1222,8 +1222,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct 
sk_buff *ack_skb, u32 prior_snd_ int fack_count; int dup_sack = (found_dup_sack (i == first_sack_index)); - if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp-undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/5] [TCP]: Rename tcp_ack_packets_out - tcp_rearm_rto
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Only thing that tiny function does is rearming the RTO (if necessary), name it accordingly. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 ++--- 1 files changed, 2 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 45ad32c..2bf3d57 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2406,8 +2406,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack, /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk) +static void tcp_rearm_rto(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -2560,7 +2559,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) = inet_csk(sk)-icsk_ca_ops; tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk); + tcp_rearm_rto(sk); if (tcp_is_reno(tp)) tcp_remove_reno_sacks(sk, pkts_acked); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [IPv6]: Invalid semicolon after if statement
On Wed, 15 Aug 2007, David Miller wrote: From: Dave Jones [EMAIL PROTECTED] Date: Wed, 15 Aug 2007 19:52:20 -0400 On Wed, Aug 15, 2007 at 03:08:14PM -0700, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 16 Aug 2007 00:57:00 +0300 (EEST) A similar fix to netfilter from Eric Dumazet inspired me to look around a bit by using some grep/sed stuff as looking for this kind of bug seemed easy to automate. This is one of them I found where it looks like this semicolon is not valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Yikes! Makes you want to audit the entire tree for these things :-))) Indeed. Here's another one. Signed-off-by: Dave Jones [EMAIL PROTECTED] That got fixed the other day and is the A similar fix to netfilter from Eric Dumazet Ilpo is referring to above :) ...heh, when I said a bit, I meant that it took a very little effort to check over the whole tree... :-) ...I've already reported all four things one could find from whole tree with this cmd (to the relevant parties), so no need for you to redo the effort :-) : $ for i in `find . -name '*.[ch]'`; do echo $i; sed -n -e '/^\t*if *[(].*[)] *; *$/ N' -e '/^\(\t*\)if *[(].*[)] *; *\n\1\t/ p' $i; done | tee result ...Basically it checks if the next line has more indentation than the if line. ...Obviously it will work only if the code follows current CodingStyle in indentation. I'm currently experimenting with indent preprocessing step... ...but I'm not yet sure if I can pull something useful out from that too :-) ...Better cmdlines could be invented, e.g. by manually checking every if (); lines once one has first automated separation of them from if () do_smthg(); lines (I haven't learned myself how to easily count/work with parenthesis pairs in a cmdline, which seems necessary here)... -- i.
Re: drivers/infiniband/mlx/mad.c misplaced ;
...I guess those guys hunting for broken busyloops in the other thread could also benefit from similar searching commands introduced in this thread... ...Ccing Satyam to caught their attention too. On Wed, Aug 15, 2007 at 05:40:11PM -0700, Joe Perches wrote: There's more than a few of these (not inspected). $ egrep -r --include=*.c \bif[[:space:]]*\([^\)]*\)[[:space:]]*\; * ...Hmm, I plugged in a preprocessor too to manage with non compliant coding styles :-). Please understand that the line numbers are not an exact match due to preprocessor changes: $ for i in `find . -name '*.[ch]'`; do echo $i; indent -npro -kr -i8 -ts8 -sob -l8000 -ss -ncs -cp1 -nhnl -st $i | egrep -n [[:space:]]if [(].*[)] ;$; done | grep -B1 ^[^.] ./arch/arm/mach-omap1/leds-innovator.c 97: if (led_state LED_STATE_ENABLED) ; -- ./arch/mips/sibyte/cfe/console.c 23: if (written 0) ; 32: if (written 0) ; -- ./arch/powerpc/kernel/legacy_serial.c 524:if (0) ; -- ./arch/powerpc/xmon/ppc-opc.c 938:else if (value == 0) ; -- ./arch/sh/boards/se/7343/io.c 137:if (0) ; -- ./arch/um/kernel/tt/tracer.c 254:if (WIFEXITED(status)) ; -- ./arch/x86_64/ia32/ptrace32.c 363:if (__copy_from_user(child-thread.i387.fxsave, u, sizeof(*u))) ; -- ./arch/x86_64/kernel/traps.c 801:if (eregs == (struct pt_regs *)eregs-rsp) ; -- ./drivers/atm/iphase.c 159:if (!desc1) ; -- ./drivers/isdn/capi/capiutil.c 456:else if (c = 0x0f) ; -- ./drivers/isdn/hisax/hfc_pci.c 125:if (Read_hfc(cs, HFCPCI_INT_S1)) ; 155:if (Read_hfc(cs, HFCPCI_INT_S1)) ; 1483: if (Read_hfc(cs, HFCPCI_INT_S1)) ; -- ./drivers/isdn/hisax/hfc_sx.c 377:if (Read_hfc(cs, HFCSX_INT_S1)) ; 407:if (Read_hfc(cs, HFCSX_INT_S2)) ; 1246: if (Read_hfc(cs, HFCSX_INT_S1)) ; -- ./drivers/media/video/video-buf.c 1141: if (q-bufs[i]) ; -- ./drivers/net/lp486e.c 777:if (lp-scb.command i596_timeout(dev, i596_cleanup_cmd, 100)) ; 785:if (lp-scb.command i596_timeout(dev, i596_reset, 100)) ; 794:if (lp-scb.command i596_timeout(dev, i596_reset(2), 400)) ; 820:if (lp-scb.command 
i596_timeout(dev, i596_add_cmd, 100)) ; 1146: if (lp-scb.command i596_timeout(dev, interrupt, 40)) ; 1192: if (lp-scb.command i596_timeout(dev, i596 interrupt, 100)) ; 1217: if (lp-scb.command i596_timeout(dev, i596_close, 200)) ; -- ./drivers/net/ni5010.c 273:if (dev-irq == 0xff) ; -- ./drivers/net/ni52.c 648:if (result TDR_LNK_OK) ; -- ./drivers/net/sun3_82586.c 498:if (result TDR_LNK_OK) ; -- ./drivers/pci/hotplug/ibmphp_core.c 418:else if (mode == BUS_MODE_PCI) ; 636:else if (mode == BUS_MODE_PCI) ; -- ./drivers/usb/gadget/file_storage.c 2480: if (protocol_is_scsi()) ; -- ./drivers/usb/host/uhci-debug.c 416:if (i = SKEL_ISO) ; 419:else if (!uhci-fsbr_is_on) ; -- ./drivers/usb/host/uhci-q.c 541:if (qh-skel == SKEL_ISO) ; -- ./drivers/usb/misc/usbtest.c 1401: if (status != 0) ; -- ./drivers/video/intelfb/intelfbdrv.c 337:if (get_opt_bool(this_opt, accel, accel)) ; 338:else if (get_opt_int(this_opt, vram, vram)) ; 339:else if (get_opt_bool(this_opt, hwcursor, hwcursor)) ; 340:else if (get_opt_bool(this_opt, mtrr, mtrr)) ; 341:else if (get_opt_bool(this_opt, fixed, fixed)) ; -- ./drivers/video/matrox/matroxfb_DAC1064.c 46: if (fvco = 10) ; -- ./drivers/video/matrox/matroxfb_maven.c 298:if (fvco = 1) ; 316:if (fvco = 10) ; -- ./fs/hfs/inode.c 72: if (!node) ; -- ./fs/hfsplus/inode.c 67: if (!node) ; -- ./fs/hostfs/hostfs_user.c 300:if (attrs-ia_valid HOSTFS_ATTR_CTIME) ; -- ./fs/xfs/xfs_bmap.c 2287: if (nullfb || XFS_FSB_TO_AGNO(mp, ap-rval) == fb_agno) ; -- ./fs/xfs/xfs_dir2.c 281:else if ((rval = xfs_dir2_isblock(tp, dp, v))) ; -- ./fs/xfs/xfs_iomap.c 248:if (io-io_flags XFS_IOCORE_RT) ; -- ./include/asm-cris/uaccess.h 255:if (n == 0) ; 303:if (n == 0) ; 351:if (n == 0) ; -- ./mm/swapfile.c 791:if (swcount = 1) ; -- ./net/core/pktgen.c 2256: if (pkt_dev-min_in6_daddr.s6_addr32[0] == 0 pkt_dev-min_in6_daddr.s6_addr32[1] == 0 pkt_dev-min_in6_daddr.s6_addr32[2] == 0 pkt_dev-min_in6_daddr.s6_addr32[3] == 0) ; -- ./net/irda/af_irda.c 1357: if (ret) ; 1358: else if 
(sk-sk_shutdown RCV_SHUTDOWN) ; -- ./net/irda/irnetlink.c 105:if (nla_put_string(msg, IRDA_NL_ATTR_IFNAME, dev-name)) ; -- ./net/netfilter/xt_u32.c 38: if (skb-len 4 || pos skb-len - 4) ; -- ./sound/pci/au88x0/au88x0_core.c 2076: if (vortex_adbdma_bufshift(vortex, i)) ; 2085: if (vortex_wtdma_bufshift(vortex, i)) ; --
Re: [PATCH 0/24] make atomic_read() behave consistently across all architectures
On Thu, 16 Aug 2007, Herbert Xu wrote: We've been through that already. If it's a busy-wait it should use cpu_relax. I looked around a bit by using some command lines and ended up wondering if these are equal to busy-wait case (and should be fixed) or not: ./drivers/telephony/ixj.c 6674: while (atomic_read(j-DSPWrite) 0) 6675- atomic_dec(j-DSPWrite); ...besides that, there are a couple more similar cases in the same file (with braces)... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [IPv6]: Invalid semicolon after if statement
A similar fix to netfilter from Eric Dumazet inspired me to look around a bit by using some grep/sed stuff as looking for this kind of bugs seemed easy to automate. This is one of them I found where it looks like this semicolon is not valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv6/ipv6_sockglue.c |2 +- 1 files changed, 1 insertions(+), 1 deletions(-) diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index d684639..761a910 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -820,7 +820,7 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); - if (copy_to_user(optval, hdr, len)); + if (copy_to_user(optval, hdr, len)) return -EFAULT; return ipv6_optlen(hdr); } -- 1.5.0.6
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: I applied everything up until the SACK validator to net-2.6.24 Ok, thanks. Everything I hit today which had not been posted before was trivial fix or a reasonable small cleanup. ...Yeah, I know that but don't want to give impression that something gets sneaked in... Why don't you just post the patches to the list one-by-one from now on so we can review them in parallel with merging into net-2.6.24? Ok, will do. ...I thought that at some point that would be wise way forward... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6.24] [TCP]: Update comment about highest_sack validity
This stale info came from the original idea, which proved to be unnecessarily complex, sacked_out 0 is easy to do and that when it's going to be needed anyway (it _can_ be valid also when sacked_out == 0 but there's not going to be a guarantee about it for now). Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |3 ++- 1 files changed, 2 insertions(+), 1 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 1f12fa0..f8cf090 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -332,7 +332,8 @@ struct tcp_sock { struct tcp_sack_block_wire recv_sack_cache[4]; - u32 highest_sack; /* Start seq of globally highest revd SACK (valid only in slowpath) */ + u32 highest_sack; /* Start seq of globally highest revd SACK +* (validity guaranteed only if sacked_out 0) */ /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; -- 1.5.0.6
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: Ok, as is clear for some the net-2.6.24 tree has been cut and is at: kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6.24.git I'll be going over the TCP bits Ilpo and I have been discussing ...In case I didn't make it clear previously, you'll notice that there are some non-submitted changes rebased as well on the top of it. then expect me to disappear back to driver hacking :-) As is, there should be enough in there for people to play with, regression test, and improve. At least these are trivial to take (after rebase, some come nicely even from the before tree): 40564051bdb237231e625ba674fdedf6e8373027 [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked 4c035cd78a6b60710b0dda4f62339df070f761c8 [TCP]: Add tcp_dec_pcount_approx int variant ed2753b0b463df41c10121ce494d51428047fcbc [TCP]: Move code from tcp_ecn.h to tcp*.c and tcp.h remove it c215314c6ea0261865d3df09f375016f3dcadeba [TCP]: Access to highest_sack obsoletes forward_cnt_hint 9e5f432fb247af2f0062c3c57d7f45d511692f26 [TCP] FRTO: remove unnecessary fackets/sacked_out recounting 96001b306c60cb969d456ac70e376220db97e54a [TCP]: Move Reno SACKed_out counter functions earlier 318cf753170367504cfac07ac89e97befb1ca501 [TCP]: Extract DSACK detection code from tcp_sacktag_write_queue(). a2539974b098065ebe02ad11a6411df4f56ba0ed [TCP]: Rexmit hint must be cleared instead of setting it - Could be combined with the extrator as it's really a bug fix, once rxmit cnt hint gets dropped, we can add this back as optimization :-). 2fea67411f0c89642fa4abc0490b65c7852a1202 [TCP]: Extracted rexmit hint clearing from the LOST marking code 1fba6548b2ecc2d513981898247472d1183329c5 [TCP]: Add highest_sack seqno, points to globally highest SACK ...I was claiming incorrectly last time that highest_sack stuff needs SACK-block validation, it does not as it takes seq from skb-seq, not from the receiver like I thought :-). 
...Redoing the validator (the improved version) on top of net-2.6.24 is on my high priority list though, so maybe I'll have it already on today. Left_out removal should be safe too: after my analysis, it seems safe as most of the functions are if-modified-then-sync type, some sync always. Clean_rtx_queue leaves left_out to old state which is the hardest thing to track. However, TCP should always execute fastretrans_alert (that syncs near entry) if sacked_out+lost_out is (or was) non-zero, so it shouldn't leak stale state anywhere. Not going to fastretrans_alert would be IMHO a bug that should be fixed. I found one such case which is fixed in [TCP]: Keep state in Disorder also if only lost_out 0). Also FRTO that is called before fastretrans_alert sync at entry, so it shouldn't be a problem to it either. ...Taking IsReno/IsFack removal is probably going to be hardest one, as it will conflict a lot. ...Maybe some sed work in patches would reduce # of conflicts instead of trying cherry-pick+rebase. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: net-2.6.24 GIT state
On Thu, 9 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Thu, 9 Aug 2007 12:50:51 +0300 (EEST) Ok, in order to get this started I merged the above patches into net-2.6.24 I'm exhausted and done for today. :-) :-) We can work on the other bits next. Next step: http://www.cs.helsinki.fi/u/ijjarvin/tcp-rebase/after-reorder/ ...I put some effort to reorganization and there's the result, please _don't_ take the new SACK block-validator yet as I haven't yet put it under any testing (it's the only difference this reordering of patches seems to have introduced :-)). The first five commits touch left_out (4 latest fix bug and flaws I made in the first commit), I could combine them into a single one if you want to. ...Notice that part of the next step includes work that hasn't yet been posted to netdev at all (was from my not yet submitted wip portion of it). Would you prefer me to post them using the usual procedure? ...Taking IsReno/IsFack removal is probably going to be hardest one, as it will conflict a lot. ...Maybe some sed work in patches would reduce # of conflicts instead of trying cherry-pick+rebase. Yes, such patch hand editing techniques can help in these cases. ...it helped a lot in the process :-). -- i.
Re: [RFC] stuff from tcp-2.6 partially merged to upcoming net-2.6.24?
On Wed, 8 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Tue, 7 Aug 2007 16:19:58 +0300 (EEST) Do you have any suggestion how I should proceed? Or do you perhaps object such partial merge completely? ...I could try to come up with a cleaned up patch series which has original and their bug fix parts combined to a single patch per change (would provide cleaner history and shouldn't be very hard to do either)... Thank you for doing the rebase. I agree with you that we should separate out as much of the non-rb-tree stuff as possible and put it into net-2.6.24 ...It would also help a lot in case we at some point of time in the future decide to merge rb-tree stuff but then have to back up to have it merged in a cleaner state which is easier to revert than with cleanups combined... Besides once I get some additional bits done (haven't yet dared to do L-bit cascade surgery which will fix timedout loop entry bugs and allow dropping of retrans hint counter), long enough mm sit would be nice route to rbtree stuff since there it's getting at least bit more versatile testing that we alone can put to it... I will try hard to make time to look at your rebase and try to move things forward. ...In case you want to validate them, git-patch-id helps to exclude identical changes between before and after allowing you to spend more time on things that required non-trivial resolution... I've been all-consumed by the NAPI work and a driver I've been writing in the background for the past few weeks. ...I can try to help... I suggest that as first step we take all changes that do not cause any conflicts (I've already tried cherry-picks), only thing I'm not sure whether I should combine change+fix parts or keep them as they were in tcp-2.6 (I suggest we combine them but you may disagree..?). 
I can either post them as a series or give them to you in --pretty=oneline format if you want to cherry-pick them yourself (in that case having a common tree would help a bit as commit ids would match as well then)). However, at least highest_sack inclusion (no space loss as one hint counter can be then dropped and allows accurate fackets out which I've partially coded already) will also need the SACK-block validator. But I basically have to redo it as the validator patch was sacktag reorganization based previously (I've already found a couple of improvements to its accuracy too :-))... -- i.
Re: TCP's initial cwnd setting correct?...
On Tue, 7 Aug 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 6 Aug 2007 15:37:15 +0300 (EEST) ...Another thing that makes me wonder, is the tp-mss_cache 1460 check as based on rfc3390 (and also it's precursor rfc2414) with up to 2190 bytes MSS TCP can use 3 as initial cwnd... I did the research and my memory was at least partially right. Below is an old bogus change of mine and the later revert with Alexey's explanation. This seems to be dealing with receive window calculation issues, rather than snd_cwnd. But they might be related and you should consider this very seriously. ...Thanks about the info, I'll study it carefully and see what's relevant in here too. And anyway we're currently on the safer side as the potential issues only make it more conservative than the RFC allows, so there's no hurry to get it in until the solution is acceptable if there's indeed a need for a change. -- i.
[RFC] stuff from tcp-2.6 partially merged to upcoming net-2.6.24?
Hi Dave, ...Noticed you were planning to open net-2.6.24 tree... IMHO, part of the stuff in tcp-2.6 could be merged to 2.6.24. I suggest that most of the stuff which is not directly related to the rbtree, new lost marker, nor sacktag reorganization is taken. Some of those things are very trivial to take as they do not introduce any conflicts. Besides that there is some stuff that would need some work if taken, as it is built on top of stuff that will remain only in tcp-2.6 (includes left_out removal and IsReno/Fack conversion)... But if it's ok, I could try to come up with a solution even to them... Perhaps do this in two (or more) stages by first taking the trivial ones... I tried rebasing tcp-2.6 (there's some not yet submitted work on top of it too) to top of be1b685fe6c9928848b26b568eaa86ba8ce0046c, result is here: http://www.cs.helsinki.fi/u/ijjarvin/tcp-rebase/{before,after} ...There was at least one gotcha (sacktag's flag reset position change when sacktag_state is created). But all in all, conflicts weren't that hard to resolve... One may resolve some things differently than I did, so YMMV if you want to try that yourself... :-) ...I also diffed all.patch'es to see if there was some undesired side-effect from diff but didn't find any. Currently only compile tested. Do you have any suggestion how I should proceed? Or do you perhaps object such partial merge completely? ...I could try to come up with a cleaned up patch series which has original and their bug fix parts combined to a single patch per change (would provide cleaner history and shouldn't be very hard to do either)... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
TCP's initial cwnd setting correct?...
I'm I missing something or why it seems to me that whenever rtt stuff is reset, also tcp_init_cwnd is not called to do RFC3390-like initial window setting (remains then in 2 which is set by tcp_v4_init_sock)... Is this done on purpose or what? ...Another thing that makes me wonder, is the tp-mss_cache 1460 check as based on rfc3390 (and also it's precursor rfc2414) with up to 2190 bytes MSS TCP can use 3 as initial cwnd... ...Compile tested patch below in case these are valid concerns... ...Goto logic could be cleaner (somebody has any suggestion for better way to structure it?) -- i. [PATCH] [TCP]: Use IW also when RTT vars get reset update to match RFC Previously when RTT got reset, initial window was not being set. Initial window defined RFC3390 (and it's precursor 2414) allows using initial window of three as long as MSS 2190 bytes. Also update RFC2414 references as it's obsoleted by RFC3390. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 32 +--- net/ipv4/tcp_output.c |2 +- 2 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index eb96864..89794ec 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,13 +805,13 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC2414. */ +/* Numbers are taken from RFC3390. */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); if (!cwnd) { - if (tp-mss_cache 1460) + if (tp-mss_cache = 2190) cwnd = 2; else cwnd = (tp-mss_cache 1095) ? 3 : 4; @@ -897,22 +897,24 @@ static void tcp_init_metrics(struct sock *sk) } tcp_set_rto(sk); tcp_bound_rto(sk); - if (inet_csk(sk)-icsk_rto TCP_TIMEOUT_INIT !tp-rx_opt.saw_tstamp) - goto reset; - tp-snd_cwnd = tcp_init_cwnd(tp, dst); - tp-snd_cwnd_stamp = tcp_time_stamp; - return; + + if (inet_csk(sk)-icsk_rto TCP_TIMEOUT_INIT !tp-rx_opt.saw_tstamp) { reset: - /* Play conservative. 
If timestamps are not -* supported, TCP will fail to recalculate correct -* rtt, if initial rto is too small. FORGET ALL AND RESET! -*/ - if (!tp-rx_opt.saw_tstamp tp-srtt) { - tp-srtt = 0; - tp-mdev = tp-mdev_max = tp-rttvar = TCP_TIMEOUT_INIT; - inet_csk(sk)-icsk_rto = TCP_TIMEOUT_INIT; + /* Play conservative. If timestamps are not +* supported, TCP will fail to recalculate correct +* rtt, if initial rto is too small. FORGET ALL AND RESET! +*/ + if (!tp-rx_opt.saw_tstamp tp-srtt) { + tp-srtt = 0; + tp-mdev = tp-mdev_max = tp-rttvar = TCP_TIMEOUT_INIT; + inet_csk(sk)-icsk_rto = TCP_TIMEOUT_INIT; + } } + + tp-snd_cwnd = tcp_init_cwnd(tp, dst); + tp-snd_cwnd_stamp = tcp_time_stamp; + return; } static void tcp_update_reordering(struct sock *sk, const int metric, diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3abe22e..4f9be23 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -209,7 +209,7 @@ void tcp_select_initial_window(int __space, __u32 mss, } /* Set initial window to value enough for senders, -* following RFC2414. Senders, not following this RFC, +* following RFC3390. Senders, not following this RFC, * will be satisfied with 2. */ if (mss (1*rcv_wscale)) { -- 1.5.0.6
Re: TCP's initial cwnd setting correct?...
On Mon, 6 Aug 2007, Ilpo Järvinen wrote: ...Goto logic could be cleaner (somebody has any suggestion for better way to structure it?) ...I could probably move the setting of snd_cwnd earlier to avoid this problem if this seems a valid fix at all. -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Tue, 31 Jul 2007, Darryl L. Miles wrote: I've been able to capture a tcpdump from both ends during the problem and its my belief there is a bug in 2.6.20.1 (at the client side) in that it issues a SACK option for an old sequence which the current window being advertised is beyond it. This is the most concerning issue as the integrity of the sequence numbers doesn't seem right (to my limited understanding anyhow). You probably didn't check the reference I explicitly gave to those who are not familiar how DSACK works, just in case you didn't pick it up last time, here it is again for you: RFC2883... However, if DSACKs really bother you still (though it shouldn't :-)), IIRC I also told you how you're able to turn it off (tcp_dsack sysctl) but I assure you that it's not a bug but feature called DSACK [RFC2883], there's _absolutely_ nothing wrong with it, instead, it would be wrong to _not_ send the below snd_una SACK in this scenario when tcp_dsack set to 1. There is another concern of why the SERVER performed a retransmission in the first place, when the tcpdump shows the ack covering it has been seen. There are only three possible reasons to this thing: 1) The ACK didn't reach the SERVER (your logs prove this to not be the case) 2) The ACK got discarded by the SERVER 3) The SERVER (not the client) is buggy and sends an incorrect retransmission ...So we have just two options remaining... I have made available the full dumps at: http://darrylmiles.org/snippets/lkml/20070731/ Thanks about these... Based on a quick check, it is rather clear that the SERVER is for some reason discarding the packets it's receiving: 04:11:26.833935 IP CLIENT.43726 SERVER.ssh: P 4239:4287(48) ack 28176 win 501 nop,nop,timestamp 819646456 16345815 04:11:27.132425 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 17096579 819458864 04:11:27.230081 IP CLIENT.43726 SERVER.ssh: . 
ack 28176 win 501 nop,nop,timestamp 819646555 16345815,nop,nop Notice, (cumulative) ack field didn't advance though new data arrived, and for the record, it's in advertised window too. There are no DSACK in here so your theory about below snd_una SACK won't help to explain this one at all... We'll just have to figure out why it's discarding it. And there's even more to prove this... This sequence is interesting from the client side: 03:58:56.419034 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 16345815 819458859 # S1 03:58:56.419100 IP CLIENT.43726 SERVER.ssh: . ack 27464 win 501 nop,nop,timestamp 819458884 16345815 # C1 03:58:56.422019 IP SERVER.ssh CLIENT.43726: P 27464:28176(712) ack 4239 win 2728 nop,nop,timestamp 16345815 819458859 # S2 03:58:56.422078 IP CLIENT.43726 SERVER.ssh: . ack 28176 win 501 nop,nop,timestamp 819458884 16345815 # C2 The above 4 packets look as expect to me. Then we suddenly see a retransmission of 26016:27464. 03:58:56.731597 IP SERVER.ssh CLIENT.43726: . 26016:27464(1448) ack 4239 win 2728 nop,nop,timestamp 16346128 819458864 # S3 ...Look at this on the retransmission: ... timestamp 16346128 819458864 ...it tells us what really got received by the TCP. The corresponding ACK with matching timestamp is, surprise, surprise, this one: 03:58:56.340180 IP CLIENT.43726 SERVER.ssh: . ack 26016 win 501 nop,nop,timestamp 819458864 16345734 ...thus the SERVER has _not_ received but discarded the subsequent cumulative ACKs!!! Therefore it's retransmitting from 26016 onward but never receives any reply as everything seems to get discarded... There was one bad checksum btw: 03:58:56.365662 IP (tos 0x10, ttl 64, id 28685, offset 0, flags [DF], proto 6, length: 764) SERVER.ssh CLIENT.43726: P [bad tcp cksum 6662 (-ef2b)!] 617734888:617735600(712) ack 2634113543 win 2728 nop,nop,timestamp 16345815 819458859 There are some changes in 2.6.22 that appear to affect TCP SACK handling does this fix a known issue ? 
There is no such known issue :-)... This issue has nothing to do with TCP SACK handling, since that code _won't_ be reached... We could verify that from the timestamps. But if you still insist that SACK under snd_una is the issue, please turn tcp_dsack to 0 on the CLIENT, you will not get them after that and you can be happy as your out-of-window SACK issue is then fixed :-)... ...Seriously, somebody else than me is probably better in suggesting what could cause the discarding at the SERVER in this case. SNMP stuff Dave was asking could help, you can find them from /proc/net/{netstat,snmp}... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] [TCP]: Also handle snd_una changes in tcp_cwnd_down
On Wed, 1 Aug 2007, Ilpo Järvinen wrote: tcp_cwnd_down must check for it too as it should be conservative in case of collapse stuff and also when receiver is trying to lie (though that wouldn't be very successful/useful anyway). Note: - Separated also is_dupack and do_lost in fast_retransalert * Much cleaner look-and-feel now * This time it really fixes cumulative ACK with many new SACK blocks recovery entry (I claimed this fixes with last patch but it wasn't). TCP will now call tcp_update_scoreboard regardless of is_dupack when in recovery as long as there is enough fackets_out. - Introduce FLAG_SND_UNA_ADVANCED * Some prior_snd_una arguments are unnecessary after it - Added helper FLAG_ANY_PROGRESS to avoid long FLAG...|FLAG... constructs Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Dave, BEWARE, I wasn't able to do anything else but compile test because Linus' tree didn't seem to boot on the machine I was trying to test it... :-( I think that to stable version only a small part of this change is necessary, not the full changeset. That should keep stable folks much happier... :-) I'll soon put my reduced proposal to: http://www.cs.helsinki.fi/u/ijjarvin/patches/stable-0001.patch The other patch (DSACK) can go to stable as is. I placed those two earlier sent bidir fixes and these two additional fixes on top of 2.6.22 and was finally able to have them tested on a bootable kernel (I had a boot failure on another machine too with 2.6.23-rc1 stuff). FACKNewReno/bidir and FACK/unidir tested, time-seq graphs were ok. Dave, please put these two patches to net-2.6 to complete bidir fix series. ...And please push to stable as well, take just the minimized fix portion of this [TCP]: Also handle snd_una changes in tcp_cwnd_down patch as I described above. Other cleanups in it can be put just to net-2.6. -- i.
Re: TCP SACK issue, hung connection, tcpdump included
...I dropped lkml, it's useless to bother them with this network related stuff... On Thu, 2 Aug 2007, Darryl Miles wrote: Ilpo Järvinen wrote: On Tue, 31 Jul 2007, Darryl L. Miles wrote: [...RFC bashing, snip...] * The older linux kernel for not being 100% SACK RFC compliant in its implementation ? Not a lot we can do about this now, but if we're able to identify there maybe backward compatibility issues with the same implementation that's a useful point to take forward. * The newer linux kernel for enabling D-SACK by default when RFC2883 doesn't even claim a cast iron case for D-SACK to be compatible with any 100% RFC compliant SACK implementation. Are you aware that D-SACK processing and generation has been part of the linux kernel TCP far before 2.6 series even begun... ...and it goes far beyond that, 2.4.0 had it too (2.2.0 didn't seem to have it, never before have I read that one IIRC :-) ). Does Ilpo have a particular vested interest in D-SACK that should be disclosed? Sure :-). ...my interest was to show you that it's not a bug :-). So it is necessary to turn off a TCP option (that is enabled by default) to be sure to have reliable TCP connections (that don't lock up) in the bugfree Linux networking stack ? This is absurd. ...You'll have to turn a lot off to be compatible with everything around Internet, and still you would probably fail. Some people have to, e.g., to turn off window scaling to work-around buggy intermediate nodes (nat boxes or some firewalls), there's even a sysctl to workaround signed 16-bit window arithmetic bugs that's mostly legacy but I bet you can find hosts broken in that area too. Etc. Yet we don't turn those off by default. If such an option causes such a problem; then that option should not be enabled by default. ...Linux TCP has enabled by default options which are _known_ (at least nowadays) to cause bad problems and many of them are _still_ enabled... Browse archives if you don't believe me... 
And I'm relatively sure it will do so also in future though I'm not the maintainer nor anybody to make such decisions... rather than wallpaper over the cracks with the voodoo of turning things that are enabled by default off. ...I said that because it felt like you kept repeating that the generated DSACK block is a bug even though, like you now know, it's a feature, not a bug. :-) 2) The ACK got discarded by the SERVER I'd thought about that one, its a possibility. The server in question does have period of high UDP load on a fair number of UDP sockets at once (a few 1000). Both systems have 2Gb of RAM. The server has maybe just 250Mb of RSS of all apps combined. ...There are three independent signs in the log to indicate discard out of these 3 reasons. Whereas your theory _fails_ to explain some behavior in the log you presented, e.g., not updated timestamp which happen even _before_ the DSACK stuff?!?... I'll formulate this question: why didn't snd_una advance nor timestamp update though a cumulative ACK arrived? You can check for yourself (in server log): 03:58:56.384503 03:58:56.462583 03:58:56.465707 03:58:56.678546 ...I'm hoping SNMPs provide explanation to it. The client sent a SACK. But from understanding more about D-SACK, this is a valid D-SACK response, but it appears to confuse the older Linux kernel at the server end. ...Are you saying that it's confused by _DSACK_ just because it's only strange thing you seem to find from the log? I see other things in your log which are exceptional and point to elsewhere... Please don't neglect them... ...Problems occur already before that DSACK is received by the server end. Agreed on this. However discarding data is allowed (providing it is intentional discarding not a bug where the TCP stack is discarding segments it shouldn't), TCP should recover providing sufficient packets get through. 
But if one end decides to discard everything after time t, TCP _will not_ recover because sufficient packets won't get through... And that's what your log is telling me. Yes discarding is allowed but that wasn't the point, we're more interested here on why it got discarded rather than allowance of discarding. Forgive me if I am mistaken, but while the server reports a checksum error, the client did not. I took this to be a misreporting by tcpdump at the server, probably due to the e1000 network card checksum offloading ...That's probably the reason, I agree, these show up. Thought that also myself, besides, it wouldn't cause that kind of breakage anyway. So the SNMP data would show up intentional discards (due to memory/resource issues). So I'll get some of those too. The SNMP stats aren't so useful right now as the box has been rebooted since then but I shall attempt to capture /proc/net/* data, cause the problem, then capture /proc/net/* data again if those numbers can help. Good, thanks. -- i.
[PATCH 2/2] [TCP]: DSACK signals data receival, be conservative
In case a DSACK is received, it's better to lower cwnd as it's a sign of data receival. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c3124e6..f030435 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -103,6 +103,7 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -966,12 +967,14 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* Check for D-SACK. */ if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)-ack_seq)) { + flag |= FLAG_DSACKING_ACK; found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1 !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { + flag |= FLAG_DSACKING_ACK; found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); @@ -1858,7 +1861,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; - if ((flagFLAG_ANY_PROGRESS) || + if ((flag(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || (IsReno(tp) !(flagFLAG_NOT_DUP))) { tp-snd_cwnd_cnt = decr1; decr = 1; -- 1.5.0.6
[PATCH 1/2] [TCP]: Also handle snd_una changes in tcp_cwnd_down
tcp_cwnd_down must check for it too as it should be conservative in case of collapse stuff and also when receiver is trying to lie (though that wouldn't be very successful/useful anyway). Note: - Separated also is_dupack and do_lost in fast_retransalert * Much cleaner look-and-feel now * This time it really fixes cumulative ACK with many new SACK blocks recovery entry (I claimed this fixes with last patch but it wasn't). TCP will now call tcp_update_scoreboard regardless of is_dupack when in recovery as long as there is enough fackets_out. - Introduce FLAG_SND_UNA_ADVANCED * Some prior_snd_una arguments are unnecessary after it - Added helper FLAG_ANY_PROGRESS to avoid long FLAG...|FLAG... constructs Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- Dave, BEWARE, I wasn't able to do anything else but compile test because Linus' tree didn't seem to boot on the machine I was trying to test it... :-( I think that to stable version only a small part of this change is necessary, not the full changeset. That should keep stable folks much happier... :-) I'll soon put my reduced proposal to: http://www.cs.helsinki.fi/u/ijjarvin/patches/stable-0001.patch The other patch (DSACK) can go to stable as is. net/ipv4/tcp_input.c | 34 ++ 1 files changed, 18 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 378ca8a..c3124e6 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -102,11 +102,13 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. 
*/ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) +#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define IsReno(tp) ((tp)-rx_opt.sack_ok == 0) #define IsFack(tp) ((tp)-rx_opt.sack_ok 2) @@ -1856,7 +1858,7 @@ static void tcp_cwnd_down(struct sock *sk, int flag) struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; - if ((flagFLAG_FORWARD_PROGRESS) || + if ((flagFLAG_ANY_PROGRESS) || (IsReno(tp) !(flagFLAG_NOT_DUP))) { tp-snd_cwnd_cnt = decr1; decr = 1; @@ -2107,15 +2109,13 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). */ static void -tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, - int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int prior_packets, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp-snd_una == prior_snd_una -(!(flagFLAG_NOT_DUP) || - ((flagFLAG_DATA_SACKED) - (tp-fackets_out tp-reordering; + int is_dupack = !(flag(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flagFLAG_DATA_SACKED) + (tp-fackets_out tp-reordering)); /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ @@ -2192,14 +2192,14 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* F. Process state. 
*/ switch (icsk-icsk_ca_state) { case TCP_CA_Recovery: - if (prior_snd_una == tp-snd_una) { + if (!(flag FLAG_SND_UNA_ADVANCED)) { if (IsReno(tp) is_dupack) tcp_add_reno_sack(sk); } else { int acked = prior_packets - tp-packets_out; if (IsReno(tp)) tcp_remove_reno_sacks(sk, acked); - is_dupack = tcp_try_undo_partial(sk, acked); + do_lost = tcp_try_undo_partial(sk, acked); } break; case TCP_CA_Loss: @@ -2215,7 +2215,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Loss is undone; fall through to processing in Open state. */ default: if (IsReno(tp)) { - if (tp-snd_una != prior_snd_una) + if (flag FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(sk); @@ -2264,7 +2264,7
Re: [PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
On Tue, 31 Jul 2007, Stephen Hemminger wrote: I noticed no difference in the two flow tests. That is not a bad thing, just that this test doesn't hit that code. ...I'm not too sure about your test setup but the bugs I fixed only cover cases that occur if flow is bidirectional (and obviously active in both directions at the same time), they won't occur in a case of unidirectional transfer or in request-reply style connections (well, in the latter case if there's some overlap, it can have effect but that's usually not significant)... In case of bidirectional transfers, you *should* see some difference as previously the fast recovery was _very_ broken. Of course there could be other issue with large cwnd TCP that hides it by going to RTO still, but at least over 384k/200ms link (DBP sized buffers, IIRC), these change behavior very dramatically, mainly in the initial slow-start overshoot recovery because in there losses per RTT is so high number compared to what is experienced later on. One or a few losses are usually recovered without RTO when congestion happens later on. The anomaly is that first flow does slow start then gets loss and ends up reducing its window size all the way to the bottom, finally it recovers. This happens with Cubic, H-TCP and others as well; if the queue in the network is large enough, they don't handle the initial loss well. ...TCP related stuff that changed in /proc/net/netstat might shed some light to this if none of the given explanations please you... :-) See the graph. What exactly do you mean by RENO in the title, I mean what's tcp_sack set to? There is occasionally a bit confusion in that respect in the terminology @ netdev, I've used to reno referring to non-SACK stuff elsewhere but in here that's not always the case... Usually it's possible to derive the correct interpretation from the context, but in this case I'm not too sure... 
:-) What I have often seen with non-SACK TCP is that initial slow-start exhausts even very large advertised window on high DBP link and then due to draining of ACK feedback, gets RTOed... That usually shows up as long lasting recovery where one segment per RTT is recovered and new data is being sent as duplicate ACKs arrive with nearly constant rate until the window limit is hit (but I cannot see such period in the graph you posted, so I guess it's not the explanation in this case). And if your RENO refers to something with SACK, that's not going to explain it anyway. ...Another nasty one I know is RED+ECN, though I'd say it's a bit far fetched one, as ECN cannot be used nicely in retransmission, a retransmission gets dropped instead of marking if RED wanted to mark. I guess that doesn't occur in your test case either? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6 0/2] [TCP]: Fix bidirectional brokenness
Hi Dave, While testing frto with bidirection TCP a while (months) ago, I encountered time-seq graphs which made absolutely no sense as if recoveries only completed after RTO. As a result, I noticed that rate-halving has problem when a flow is bidirection but testing the patch has been on my todo list for ages. ...Finally, here it is... ...Please think the first one a bit because there might be some corner cases with reno. While testing it I came across with an additional issue that can occur with bidirectional traffic. And, even better, the second fix seems to also solves a third issue which affects both unidirectional and bidirectional flows, though it's a marginal case (cumulative ACK that causes a larger number of new SACKed skbs). I've never seen that third one to occur but it's there if you get enough ACK losses, subsequent ACKs (if one gets them) do solve it so that's not a very bad issue to begin with... I've verified these from time-seq graphs on top of tcp-2.6, which had some additional (mostly cleanup and the rebase I promised you earlier) though I added the tp-fackets_out tp-reordering check afterwards as it seems necessary to avoid going to lost marker too often (wouldn't have had any effect in my test case anyway). ...Please consider to net-2.6 and to stable too. These will generate you some hassle when you rebase tcp-2.6. Btw, you forgot to push tcp-2.6 out last time though I could assume it's state... :-) In case you're going to now push it out, could you please drop [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked from it as it seems to be out of place in tcp-2.6, I can resubmit it to net-2.6.24 when you open it (unless you want to put it to net-2.6 directly as it's rather trivial one). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
Actually, the ratehalving seems to work too well, as cwnd is reduced on every second ACK even though the packets in flight remains unchanged. Recoveries in a bidirectional flows suffer quite badly because of this, both NewReno and SACK are affected. After this patch, rate halving is performed for ACK only if packets in flight was supposedly changed too. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 23 +-- 1 files changed, 13 insertions(+), 10 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 0163051..767f92c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1848,19 +1848,22 @@ static inline u32 tcp_cwnd_min(const struct sock *sk) } /* Decrease cwnd each second ack. */ -static void tcp_cwnd_down(struct sock *sk) +static void tcp_cwnd_down(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); int decr = tp-snd_cwnd_cnt + 1; + + if ((flagFLAG_FORWARD_PROGRESS) || + (IsReno(tp) !(flagFLAG_NOT_DUP))) { + tp-snd_cwnd_cnt = decr1; + decr = 1; - tp-snd_cwnd_cnt = decr1; - decr = 1; + if (decr tp-snd_cwnd tcp_cwnd_min(sk)) + tp-snd_cwnd -= decr; - if (decr tp-snd_cwnd tcp_cwnd_min(sk)) - tp-snd_cwnd -= decr; - - tp-snd_cwnd = min(tp-snd_cwnd, tcp_packets_in_flight(tp)+1); - tp-snd_cwnd_stamp = tcp_time_stamp; + tp-snd_cwnd = min(tp-snd_cwnd, tcp_packets_in_flight(tp)+1); + tp-snd_cwnd_stamp = tcp_time_stamp; + } } /* Nothing was retransmitted or returned timestamp is less @@ -2057,7 +2060,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); } } @@ -2257,7 +2260,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, if (is_dupack || tcp_head_timedout(sk)) tcp_update_scoreboard(sk); - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } -- 1.5.0.6
[PATCH net-2.6 2/2] [TCP]: Bidir flow must not disregard SACK blocks for lost marking
It's possible that new SACK blocks that should trigger new LOST markings arrive with new data (which previously made is_dupack false). In addition, I think this fixes a case where we get a cumulative ACK with enough SACK blocks to trigger the fast recovery (is_dupack would be false there too). I'm not completely pleased with this solution because readability of the code is somewhat questionable as 'is_dupack' in SACK case is no longer about dupacks only but would mean something like 'lost_marker_work_todo' too... But because of Eifel stuff done in CA_Recovery, the FLAG_DATA_SACKED check cannot be placed to the if statement which seems attractive solution. Nevertheless, I didn't like adding another variable just for that either... :-) Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |5 - 1 files changed, 4 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 767f92c..cfe6ac7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2109,7 +2109,10 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp-snd_una == prior_snd_una !(flagFLAG_NOT_DUP)); + int is_dupack = (tp-snd_una == prior_snd_una +(!(flagFLAG_NOT_DUP) || + ((flagFLAG_DATA_SACKED) + (tp-fackets_out tp-reordering; /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ -- 1.5.0.6
Re: [PATCH net-2.6 1/2] [TCP]: Fix ratehalving with bidirectional flows
On Mon, 30 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 30 Jul 2007 20:18:45 +0300 (EEST) Actually, the ratehalving seems to work too well, as cwnd is reduced on every second ACK even though the packets in flight remains unchanged. Recoveries in a bidirectional flows suffer quite badly because of this, both NewReno and SACK are affected. After this patch, rate halving is performed for ACK only if packets in flight was supposedly changed too. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] This fix looks great, and I see no potential problems with Reno after doing a bit of auditing. Applied and I'll push to -stable too. I think it's probably good to add tp-snd_una != prior_snd_una check there too... It's not going to make a large difference, mostly just to be conservative when skb collapse stuff got done (and maybe to annoy cheaters too though I couldn't at this point figure out how they could abuse it)... ...I think I can come up with that on Wednesday, so please hold stable push until that. -- i.
Re: [PATCH net-2.6 2/2] [TCP]: Bidir flow must not disregard SACK blocks for lost marking
On Mon, 30 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 30 Jul 2007 20:19:40 +0300 (EEST) I'm not completely pleased with this solution because readability of the code is somewhat questionable as 'is_dupack' in SACK case is no longer about dupacks only but would mean something like 'lost_marker_work_todo' too... But because of Eifel stuff done in CA_Recovery, the FLAG_DATA_SACKED check cannot be placed to the if statement which seems attractive solution. Nevertheless, I didn't like adding another variable just for that either... :-) I don't mind the complex conditionals so much in loss handling, they are almost inevitable. However I believe they could be simplified as a lot of pieces of code ask similar if not identical questions. We could ask several of these things up-front, regardless of path we will take (reno, DSACK, reorder, FRTO, etc.) and pass the answers along in a bitmask. We do that to some extent already with how we analyze the retransmit queue at the beginning of ACK processing. That's true, ...and my thought too as I was thinking of adding FLAG_SND_UNA_ADVANCED (!= FLAG_DATA_ACKED) (or it's opposite as I haven't yet checked which way it is more useful). That could help in many places and probably reduces compare pressure here and there as the bitmask operation can be done in one compare where it now has to do two compares, one for snd_una and other for the flag. ...I'll keep an eye on other possible bits too... :-) -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 06:59:26AM +0100, Darryl L. Miles wrote: CLIENT = Linux 2.6.20.1-smp [Customer build] SERVER = Linux 2.6.9-55.ELsmp [Red Hat Enterprise Linux AS release 4 (Nahant Update 5)] The problems start around time index 09:21:39.860302 when the CLIENT issues a TCP packet with SACK option set (seemingly for a data segment which has already been seen) from that point on the connection hangs. ...That's DSACK and it's being correctly sent. To me, it seems unlikely to be the cause for this breakage... Where was the capture taken ? on CLIENT or on SERVER (I suspect client from the timers) ? ...I would guess the same based on SYN timestamps (and from the DSACK timestamps)... A possible, but very unlikely reason would be an MTU limitation somewhere, because the segment which never gets correctly ACKed is also the largest one in this trace. Limitation for 48 byte segments? You have to be kidding... :-) But yes, it seems that one of the directions is dropping packets for some reason though I would not assume MTU limitation... Or did you mean some other segment? Also, if everything is right with the packets on the server side, then it would means that it's the RHEL kernel which is buggy (which does not mean that the same bug does not exist in mainline). ...There are two independed signs that server side is not getting the packets (ack field is not advanced nor do the retransmissions of an old data cease) so IMHO that would the first thing to rule out. Full dump available via email. 09:12:13.444999 IP CLIENT.50727 SERVER.ssh: S 2919512080:2919512080(0) win 5840 mss 1460,sackOK,timestamp 799860259 0,nop,wscale 7 09:12:13.535643 IP SERVER.ssh CLIENT.50727: S 492909547:492909547(0) ack 2919512081 win 5792 mss 1460,sackOK,timestamp 7126976 799860259,nop,wscale 2 09:12:13.535717 IP CLIENT.50727 SERVER.ssh: . 
ack 1 win 46 nop,nop,timestamp 799860282 7126976 09:12:13.665481 IP SERVER.ssh CLIENT.50727: P 1:24(23) ack 1 win 1448 nop,nop,timestamp 7127074 799860282 09:12:13.665895 IP CLIENT.50727 SERVER.ssh: . ack 24 win 46 nop,nop,timestamp 799860314 7127074 09:12:13.666044 IP CLIENT.50727 SERVER.ssh: P 1:23(22) ack 24 win 46 nop,nop,timestamp 799860314 7127074 09:12:13.776318 IP SERVER.ssh CLIENT.50727: . ack 23 win 1448 nop,nop,timestamp 7127216 799860314 ...SNIPPED SUCCESSFUL TRAFFIC... 09:21:39.490740 IP SERVER.ssh CLIENT.50727: P 18200:18464(264) ack 2991 win 2728 nop,nop,timestamp 7692910 81727 09:21:39.490775 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81755 7692910 09:21:39.860245 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693293 81749 09:21:39.860302 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81847 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:40.453440 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693887 81749 09:21:40.453495 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81996 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:41.641821 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7695075 81749 09:21:41.641938 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 82293 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:44.017552 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7697451 81749 09:21:44.017622 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 82887 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:48.770051 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7702203 81749 09:21:48.770099 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 84075 7692910,nop,nop,sack sack 1 {12408:13856} 09:21:58.274061 IP SERVER.ssh CLIENT.50727: . 
12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7711707 81749 09:21:58.274180 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 86450 7692910,nop,nop,sack sack 1 {12408:13856} 09:22:17.282035 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7730715 81749 09:22:17.282153 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 800011202 7692910,nop,nop,sack sack 1 {12408:13856} 09:22:55.298955 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7768731 81749 09:22:55.299023 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 800020705 7692910,nop,nop,sack sack 1 {12408:13856} 09:24:11.329853 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7844763 81749 09:24:11.329923 IP CLIENT.50727 SERVER.ssh: . ack
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 11:26:00AM +0300, Ilpo Järvinen wrote: On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 06:59:26AM +0100, Darryl L. Miles wrote: CLIENT = Linux 2.6.20.1-smp [Customer build] SERVER = Linux 2.6.9-55.ELsmp [Red Hat Enterprise Linux AS release 4 (Nahant Update 5)] The problems start around time index 09:21:39.860302 when the CLIENT issues a TCP packet with SACK option set (seemingly for a data segment which has already been seen) from that point on the connection hangs. ...That's DSACK and it's being correctly sent. To me, it seems unlikely to be the cause for this breakage... Where was the capture taken ? on CLIENT or on SERVER (I suspect client from the timers) ? ...I would guess the same based on SYN timestamps (and from the DSACK timestamps)... A possible, but very unlikely reason would be an MTU limitation somewhere, because the segment which never gets correctly ACKed is also the largest one in this trace. Limitation for 48 byte segments? You have to be kidding... :-) But yes, it seems that one of the directions is dropping packets for some reason though I would not assume MTU limitation... Or did you mean some other segment? No, I was talking about the 1448 bytes segments. But in fact I don't believe it much because the SACKs are always retransmitted just afterwards. Ah, but it's ACKed correctly right below it...: [...snip...] 09:21:39.490740 IP SERVER.ssh CLIENT.50727: P 18200:18464(264) ack 2991 win 2728 nop,nop,timestamp 7692910 81727 09:21:39.490775 IP CLIENT.50727 SERVER.ssh: . ack 18464 win 378 nop,nop,timestamp 81755 7692910 09:21:39.860245 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693293 81749 ...segment below snd_una arrived = snd_una remains 18464, receiver generates a duplicate ACK: 09:21:39.860302 IP CLIENT.50727 SERVER.ssh: . 
ack 18464 win 378 nop,nop,timestamp 81847 7692910,nop,nop,sack sack 1 {12408:13856} The cumulative ACK field of it covers _everything_ below 18464 (i.e., it ACKs them), including the 1448 bytes in 12408:13856... In addition, the SACK block is DSACK information [RFC2883] telling explicitly the address of the received duplicate block. However, if this ACK doesn't reach the SERVER TCP, RTO is triggered and the first not yet cumulatively ACKed segment is retransmitted (I guess cumulative ACKs up to 12408 arrived without problems to the SERVER): 09:21:40.453440 IP SERVER.ssh CLIENT.50727: . 12408:13856(1448) ack 2991 win 2728 nop,nop,timestamp 7693887 81749 [...snip...] BTW, some information are missing. It would have been better if the trace had been read with tcpdump -Svv. We would have got seq numbers and ttl. Also, we do not know if there's a firewall between both sides. Sometimes, some IDS identify attacks in crypted traffic and kill connections. It might have been the case here, with the connection closed one way on an intermediate firewall. Yeah, firewall or some other issue, I'd say it's quite unlikely a bug in TCP because behavior to both directions indicate client - sender blackhole independently of each other... -- i.
Re: TCP SACK issue, hung connection, tcpdump included
On Sun, 29 Jul 2007, Willy Tarreau wrote: On Sun, Jul 29, 2007 at 12:28:04PM +0300, Ilpo Järvinen wrote: [...snip...] BTW, some information are missing. It would have been better if the trace had been read with tcpdump -Svv. We would have got seq numbers and ttl. Also, we do not know if there's a firewall between both sides. Sometimes, some IDS identify attacks in crypted traffic and kill connections. It might have been the case here, with the connection closed one way on an intermediate firewall. Yeah, firewall or some other issue, I'd say it's quite unlikely a bug in TCP because behavior to both directions indicate client - sender blackhole independently of each other... It would also be possible that something stupid between both ends simply drops packets with the SACK option set. Also something which sometimes happen is that some firewalls automatically translate sequence numbers but not necessarily SACK values, which could pretty well lead to this packet being received but ignored on the server side. ...One can toggle those off with /proc/sys/net/ipv4/tcp_dsack but I suspect DSACKs are not the cause because these retransmissions neither are making it through (there are many of them also earlier in the log, just quoted the easiest ones :-) ): 09:36:44.335591 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800227942 7692910 09:38:44.351950 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800257942 7692910 09:40:44.368172 IP CLIENT.50727 SERVER.ssh: P 2991:3039(48) ack 18464 win 378 nop,nop,timestamp 800287943 7692910 ...there are no SACKs involved in them, yet no cumulative ACK ever arrives from SERVER... -- i.
Re: [RFC 2/3] TCP: cubic - eliminate use of receive time stamp
Hi all, It's definately better to do rtt measurements in the proper callback like you're suggesting since the call to cong_avoid depends on too many factors (ECN, FRTO) that shouldn't inhibit rtt sampling though they, for a period, determine how cwnd must be set... -- i. On Thu, 19 Jul 2007, Stephen Hemminger wrote: Remove use of received timestamp option value from RTT calculation in Cubic. A hostile receiver may be returning a larger timestamp option than the original value. This would cause the sender to believe the malevolent receiver had a larger RTT and because Cubic tries to provide some RTT friendliness, the sender would then favor the liar. Instead, use the jiffie resolutionRTT value already computed and passed back after ack. Signed-off-by: Stephen Hemminger [EMAIL PROTECTED] --- a/net/ipv4/tcp_cubic.c2007-07-19 08:21:42.0 +0100 +++ b/net/ipv4/tcp_cubic.c2007-07-19 08:24:58.0 +0100 @@ -246,38 +246,12 @@ static inline void bictcp_update(struct ca-cnt = 1; } - -/* Keep track of minimum rtt */ -static inline void measure_delay(struct sock *sk) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 delay; - - /* No time stamp */ - if (!(tp-rx_opt.saw_tstamp tp-rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - (s32)(tcp_time_stamp - ca-epoch_start) HZ) - return; - - delay = (tcp_time_stamp - tp-rx_opt.rcv_tsecr)3; - if (delay == 0) - delay = 1; - - /* first time call or link delay decreases */ - if (ca-delay_min == 0 || ca-delay_min delay) - ca-delay_min = delay; -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - if (data_acked) - measure_delay(sk); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -337,14 +311,30 @@ static void bictcp_state(struct sock *sk static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) { const struct inet_connection_sock *icsk = 
inet_csk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { - struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; } -} + /* Some calls are for duplicates without timetamps */ + if (rtt_us 0) + return; + + /* Discard delay samples right after fast recovery */ + if ((s32)(tcp_time_stamp - ca-epoch_start) HZ) + return; + + delay = usecs_to_jiffies(rtt_us) 3; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca-delay_min == 0 || ca-delay_min delay) + ca-delay_min = delay; +} static struct tcp_congestion_ops cubictcp = { .init = bictcp_init, - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linux 2.6.22: Leak r=1 1
On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux [EMAIL PROTECTED] /proc/sys/net/ipv4]# grep . * ...snip... tcp_frto:1 ...This is fully unrelated to the issue but I'm a bit curious who enabled frto on your machine (since it's disabled by default), did you do it by yourself or the distribution perhaps? This is interesting because for frto to be useful on a large scale, the sender must have it enabled and therefore it's usually not under control of the host that is attached to the wireless access link. ...In case some distribution (yours) is already enabling it, the deployment is somewhat proceeding already :-)... ...And we also get incremental testing of the code too... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Linux 2.6.22: Leak r=1 1
On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Took a while to grep what is leaking. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux Just normal Internet usage, azureus for example =) I think this is easy to trigger. I guess those packet loss periods help you to reproduce it so easily. # ss -n|wc -l 870 # ping -A 80.223.96.1 PING 80.223.96.1 (80.223.96.1) 56(84) bytes of data. 64 bytes from 80.223.96.1: icmp_seq=1 ttl=255 time=431 ms ... --- 80.223.96.1 ping statistics --- 40 packets transmitted, 25 received, 37% packet loss, time 17954ms rtt min/avg/max/mdev = 406.000/467.758/530.983/29.384 ms, pipe 2, ipg/ewma 460.361/456.381 ms But ploss is only temporary (when I am downloading with azureus =) , when only uploading (95% of bandwidth used) rtt avg = 32ms). # dmesg|grep Leak [114992.191011] Leak r=1 4 [124231.713348] Leak r=1 4 [142807.938284] Leak r=1 4 [142999.674521] Leak r=1 1 [143177.462073] Leak r=1 4 [143230.001570] Leak r=1 4 [143232.982560] Leak r=1 4 [143234.537096] Leak r=1 4 [143297.927760] Leak r=1 4 [143300.633603] Leak r=1 4 [143302.172917] Leak r=1 4 [143357.083193] Leak r=1 1 [143361.780879] Leak r=1 4 [143413.706490] Leak r=1 4 [143552.996598] Leak r=1 1 [EMAIL PROTECTED] /proc/sys/net/ipv4]# grep . * [...snip...] tcp_frto:1 I suspect this is the main ingrediment to trigger these leaks, well, I'm pretty sure of... Sami, please test the patch included below, Dave can then put that one to net-2.6 and to stable too. This is sort of poking to dark still... But it's pretty much the only place where the SACKED_RETRANS bit is touched without checking first that the adjustment can safely be made (and all SACKED_RETRANS changes in 2.6.22 are FRTO related as well). Most likely something cleared SACKED_RETRANS bit underneath FRTO and in tcp_enter_frto_loss I just blindly assumed that it's still there. 
While the patch below probably works, and leaks are no more, I'd like to get to the bottom of this by really figuring out what caused the SACKED_RETRANS bit to get cleared in the first place (wasn't expecting this to happen while I wrote the FRTO). I guess that it could be lost retransmit loop in sacktag but again I've no concrete proof for that yet. Because for that to trigger, something must have allowed sending skbs past the snd_nxt at the time of the RTO, which too must be prevented during FRTO! Thus there could be other issues while this is just a symptom of the main problem. I'd be interested to study some tcpdumps that relate to Leak cases you're seeing. Could you record some Sami? I'm not sure though how one can figure out the timestamp relation between the kernel log and a tcpdump log... Anyway, for this debugging, you should use a debug version of this patch with WARN_ON to get exact timestamp of the event since the leak print may occur much later on, I put one available at http://www.cs.helsinki.fi/u/ijjarvin/patches/ . Other candidates for the cause are even less likely. The first two are self-standing so that this patch is going to be necessary as long as fuzzy SACK blocks are allowed to be received and processed in sacktag (regardless there turns to be additional problem triggering this one or not): - DSACK touching snd_una (receiver is pretty inconsistent with itself because snd_una wasn't advanced). - 2xRTO and SACK @ snd_una (same note as above) - snd_una advanced full skb _without_ FLAG_DATA_ACKED being set (unlikely) - Head not retransmitted on RTO when FRTO was enabled (no, it's not going to be this one) It cannot be double SACKED_RETRANS because another msg would be printed already in tcp_retransmit_skb. Anyway, I'll be offline over the weekend starting from the Friday morning. ...thanks for your report Sami. -- i. 
[PATCH] [TCP]: Verify the presence of RETRANS bit when leaving FRTO For yet unknown reason, something cleared SACKED_RETRANS bit underneath FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 +++- 1 files changed, 3 insertions(+), 1 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 69f9f1e..4e5884a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1398,7 +1398,9 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) * waiting for the first ACK and did not get it)... */ if ((tp-frto_counter == 1) !(flagFLAG_DATA_ACKED)) { - tp-retrans_out += tcp_skb_pcount(skb); + /* For some reason this R-bit might get cleared? */ + if (TCP_SKB_CB(skb)-sacked TCPCB_SACKED_RETRANS) + tp-retrans_out += tcp_skb_pcount(skb); /* ...enter this if branch just for the first segment */ flag
Re: Linux 2.6.22: Leak r=1 1
On Thu, 12 Jul 2007, Sami Farin wrote: On Thu, Jul 12, 2007 at 10:53:57 +0300, Ilpo Järvinen wrote: On Wed, 11 Jul 2007, Sami Farin wrote: That's right, so descriptive is the new Linux kernel 2.6.22. Took a while to grep what is leaking. Linux safari.finland.fbi 2.6.22-cfs-v19 #3 SMP Tue Jul 10 00:22:25 EEST 2007 i686 i686 i386 GNU/Linux Just normal Internet usage, azureus for example =) I think this is easy to trigger. I guess those packet loss periods help you to reproduce it so easily. ... I'd be interested to study some tcpdumps that relate to Leak cases you're seeing. Could you record some Sami? I'm not sure though how one can figure I now have 300 MB capture and several newretarded music videos... And 10 WARNINGs and 0 Leak printk's. Thanks, every warning would have lead to a Leak print later on (not necessarily with 1-to-1 relation but every pending warning would be acknowledged by a single Leak print). So every time the WARNING triggers, inconsistency would have been result without the patch. 2007-07-12 12:03:18.910712500 4[ 1318.606826] WARNING: at net/ipv4/tcp_input.c:1402 tcp_enter_frto_loss() ...snip... This is MAYBE the guilty connection if timestamps are to be believed: ...snip... I think you got the correct connection... Thanks. The problem seems to be related to FIN (a case that wouldn't have occurred to me without your log, thanks again :-))... I think that the patch I suggested should be fine (and it fixes the fuzzy sack block issues as well) though I still have problem in figuring out what's the exact path of execution on each ACK near the end of the connection (the sent packets are misplaced in the shown dump but the original order can be reconstructed from IP identifiers and TCP timestamps). Well, haven't gotten Leaks anymore after applying the patch. I'd have been a bit surprised if they would have still been there with the patch... -- i.
Re: [PATCH 2.6.22] TCP: Make TCP_RTO_MAX a variable (take 2)
On Thu, 12 Jul 2007, Rick Jones wrote: One question is why the RTO gets so large that it limits failover? If Linux TCP is working correctly, RTO should be srtt + 2*rttvar So either there is a huge srtt or variance, or something is going wrong with RTT estimation. Given some reasonable maximums of Srtt = 500ms and rttvar = 250ms, that would cause RTO to be 1 second. I suspect that what is happening here is that a link goes down in a trunk somewhere for some number of seconds, resulting in a given TCP segment being retransmitted several times, with the doubling of the RTO each time. But that's a back-off for the retransmissions, the doubling is temporary... Once you return to normal conditions, the accumulated backoff multiplier will be immediately cut back to normal. So you should then be back to 1 second (like in the example or whatever) again... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [NET]: Another unnecessary net/tcp.h inclusion in net/dn.h
No longer needed. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/dn.h |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/include/net/dn.h b/include/net/dn.h index ac4ce90..6277783 100644 --- a/include/net/dn.h +++ b/include/net/dn.h @@ -3,7 +3,6 @@ #include linux/dn.h #include net/sock.h -#include net/tcp.h #include asm/byteorder.h #define dn_ntohs(x) le16_to_cpu(x) -- 1.5.0.6
[PATCH] [BNX2]: Seems to not need net/tcp.h
Got bored to always recompile it for no reason. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- drivers/net/bnx2.c |1 - 1 files changed, 0 insertions(+), 1 deletions(-) diff --git a/drivers/net/bnx2.c b/drivers/net/bnx2.c index 88b33c6..56244f6 100644 --- a/drivers/net/bnx2.c +++ b/drivers/net/bnx2.c @@ -40,7 +40,6 @@ #define BCM_VLAN 1 #endif #include net/ip.h -#include net/tcp.h #include net/checksum.h #include linux/workqueue.h #include linux/crc32.h -- 1.5.0.6
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Mon, 2 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 16 Jun 2007 02:04:25 +0300 (EEST) There are still some things I must think carefully in sacktag processing since it does not validate start_seq and end_seq at all which can be abused currently at least in tcp-2.6. ...I would rather put end to the whole russian roulette in tcp-2.6 sacktag rather than fix/think individual cases and leave future modifications of it similarily hazardous. It's not very clear to me how to handle all possible cases of invalid SACK blocks though, perhaps TCP should just ignore such sack blocks without doing any processing based on them, i.e., ignore them whenever start_seq-end_seq does not fully fit to snd_una-snd_nxt (expect DSACK of course, which should be processed if it's between undo_marker-snd_nxt). Do you have any thoughts about this? I agree. This is a problem that basically every single TCP stack out there right now is vulnerable to, lots of cpu processing for invalid or maliciously created SACK blocks. This is why I even considered the RB-Tree stuff at all. Yes, without RB-tree TCP is quite much out of luck. However, as long as there is depency to skb's fack_count in sacktag, there seems to be some room for cpu processing attacks even with RB-tree case because walk becomes necessary on the slow path. I've thought about three solutions to this, each has some pros cons so I'm not that sure which path should be pursued: 1. fack_count in skb (your proposal months ago) + Trivial arithmetics, no walking necessary to find it (ever) - Very expensive in storage wise (like you have stated earlier) 2. fackets_out removal like I presented earlier (which you seem to have looked by now). It was really RFC and was somewhat in state of flux still + Fast path is fully arithmetics based + Saves little (extra) space in tcp_sock too + Its effect for one directional flow without nagle is quite insignificant I would say (in friendly environment). 
- But alas, I've noticed that any one directional flow could be forced to bidirectional slow path by sending some dummy data with gaps (in malicious intend), then the sender would use SACKs to inform about those losses whenever data is being send, and if window is large enough TCP would be destined to slow path... :-( How easy it is, i.e., with how small window the sent SACKs exceed MSS size which is the fastpath/slowpath selector depends largely on how large fraction the SACK blocks are in bytes of MSS, with 9000 MTU it's perhaps no longer so easy to do is large scale... Thus it's not anymore sender controllable by nagle setting, like I thought earlier, and therefore it's not that attractive anymore. * malicious tcp_fragment forcing would have to be also closed before this is of any use. It seems to be easy to do by adding WAS_COLLAPSED bit to skb-sacked (has some unused bits) and ignoring middle skb SACKs too. IIRC you were (almost fully) for this already in the past? Haven't checked but it should be quite easy to do, and it is something I'll probably do anyway... 3. seqno based tp-reordering + Trivial to calculate now that there is highest_sack - No longer would the pkt counting be fully pkts based. Not that bad in the case of no Nagle though (which is fully under sender's control :-)). Equal behavior can be estimated by doing rounding to MSS size though (requires some divides though), for no-nagle case it could do exactly the same if I have thought it correctly. - I'd expect some small skb user will be complaining due to performance loss (recovery latency increasing), perhaps we could add sysctl for enabling expensive processing for them. Or just try without it and see what happens, nobody would notice I guess though it will probably surface later on when somebody is first time looking numbers (and is not pleased)... ...And would then of course claim that 2.6.3x (or something later) has a regression against 2.6.(3x-1)... 
:-) I think that the expensive processing stuff can reuse some work I've already done for the alternative 2 earlier to avoid walk-always in case of a flow with normal usage pattern on the same host. ...I'd say that alternative 3 is most attractive of these... But due to small skb recovery affecting change I'm not too sure if it's desired? With RB-tree things get really interesting since it seems that TCP has some chance... :-) There are many things still to handle but it's getting closer already after SACK validation is in tcp-2.6. One thing I would be happy avoid is the full walk that is necessary when SACK reneging is detected... It is already much harder to do in tcp-2.6 and at least the amount of work decreases compared to the last reneging (if it was in the same
Re: [PATCH 8/9] [TCP]: Reduce sacked_out with reno when purging write_queue
On Mon, 2 Jul 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:36:01 +0300 Previously TCP had a transitional state during which reno counted segments that are already below the current window into sacked_out, which is now prevented. Re-try now unconditional S+L catching (I wonder if we could get that BUG_ON place pinpointed more exactly in oops because now inlining makes it lose its original context as they always seem to be in tcp_ack, #define helps?). Beware, this change is not a trivial one and might have some unexpected side-effects under obscure conditions since state tracking is to happen much later on and the reno sack counting was highly depending on the current state. This approach conservatively calls just remove_sack and leaves reset_sack() calls alone. The best solution to the whole problem would be to first calculate the new sacked_out fully (this patch does not move reno_sack_reset calls from original sites and thus does not implement this). However, that would require very invasive change to fastretrans_alert (perhaps even slicing it to two halves). Alternatively, all callers of tcp_packets_in_flight (i.e., users that depend on sacked_out) should be postponed until the new sacked_out has been calculated but it isn't any simpler alternative. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] So basically the idea behind this patch is to do the update of the fake RENO sakcs in clean_rtx_queue instead of fixing it up at the very last moment right before we invoke tcp_try_to_undo_partial(). Yeap, it would change sacked_out that things are seeing before undo_partial point (if there would be some) but it would qualify IMHO as fix for those case too rather than bug. I checked that F-RTO (with reno) cannot ever access stale sacked_out because of other constraints based on ACK's snd_una, and that's pretty much what's being done between there. 
TCP might still do another update later on by using tcp_reset_reno_sack, but that's always going to be a more aggressive update (or at least an equal one). I like this patch and I can't find any holes in the idea. But some things have changed in the meantime and this patch (and probably 9/9 too) don't apply cleanly. Could you respin these against current tcp-2.6 so I can apply them? I knew that :-), it was postponed due to other, more important issues and because the trees then got some dependencies at that point I decided it's better to wait for a while until tcp-2.6 gets all stuff that's to be in net-2.6... But now that things seem to have settled, I'll provide the updated one later on. The 9/9 should be dropped, I've noticed a cpu processing vulnerability in it which I previously didn't see there (I describe it in my other post). -- i.
Re: [RFC PATCH tcp-2.6 1/2] [TCP]: Discard fuzzy SACK blocks
On Mon, 18 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Tue, 19 Jun 2007 01:25:57 +0300 From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been sort of russian roulette as no validation of SACK blocks is previously attempted. It is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). This fixes also one remote triggerable NULL-ptr access: start_seq was trusted as a valid mark_lost_entry_seq but without validation it is relatively easy to inject an invalid sequence number there that will cause a crash in the lost marker code. Other SACK processing code seems safe so this could have been fixed locally but that would leave SACK processing hazardous in case of future modifications. IMHO, it's just better to close the whole roulette rather than disarming just one bullet. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] This looks good applied. Does mainline 2.6.x has this NULL-ptr issues too? If so we'll have to fix it there very soon. The null-ptr issue is related to the new lost marker code which isn't in the mainline (I have even tried to state this couple of times in the earlier posts so that you could have less worries as I understand that it's your highest priority concern :-)). I.e, the mainline lost marker code scans queue by using tcp_for_write_queue() or starts from the hint it has learned earlier, it doesn't have to trust any sequences that are given by the peer in SACK block. I'll try to explain this once again: Because the new lost marker fastpath blindly trusted start_seq from received SACK, it could be abused to search for a sequence number for which tcp_write_queue_find returns NULL (happens when the searched sequence is outside of current window). 
Another trouble would nowadays occur if start_seq == snd_una because write_queue_find is now given start_seq - 1 (which is fully intentional, no need to find skb at start_seq but one below it). I disallowed SACK blocks starting from snd_una partly due to that, and partly due to it's non-sense interpretation. It would not end up to the lost marker though because sack reneging detection intervenes. Obviously I would have alerted you and security folks if there would have been this problem in mainline stable too (and probably had done a valid patch to those trees at the first time). In case you still want to verify mainline by yourself, you can see where start_seq and end_seq goes in mainline's sacktag (not very hard to see :-)). Like I've explained earlier, only serious suspect seems to be the calculation of pkt_len that is input to tcp_fragment. Other cases are relatively trivial. However, analytical analysis of pkt_len setting took me considerable amount of time due to vast number of negations which were able to defeat my brains at least partly. Therefore I really wouldn't mind if also you verify that pkt_len never becomes either zero or exceeds skb-len (even equal to skb-len is not very nice). Things I tried to consider are (you might find some additional thing to consider besides these): - outside snd_una-snd_nxt start/end_seq - start/end_seq equal - start/end_seq reversing - 2^31 wrap problems - after/before ambiguity (discussed some time ago on netdev) I couldn't find a problem causing case because the successive before() and after() jails were so tight. Neither did my limited bruteforcer succeed (as you can see, I wasn't that convinced about by my analysis validity)... For sure you're curious enough - have a nice day with the negations... ;-) ...Seriously, double verification of that pkt_len part wouldn't hurt considering it's complexity. -- i.
[RFC PATCH tcp-2.6 1/2] [TCP]: Discard fuzzy SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] SACK processing code has been sort of russian roulette as no validation of SACK blocks is previously attempted. It is not very clear what all kinds of broken SACK blocks really mean (e.g., one that has start and end sequence numbers reversed). This fixes also one remote triggerable NULL-ptr access: start_seq was trusted as a valid mark_lost_entry_seq but without validation it is relatively easy to inject an invalid sequence number there that will cause a crash in the lost marker code. Other SACK processing code seems safe so this could have been fixed locally but that would leave SACK processing hazardous in case of future modifications. IMHO, it's just better to close the whole roulette rather than disarming just one bullet. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 40 1 files changed, 40 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4c8882e..ebd9739 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -977,6 +977,27 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * so that we are allowed not to be bothered by order of our actions, * when multiple events arrive simultaneously. (see the function below). * + * SACK block validation. + * -- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. But + * there all simplicity ends, TCP might receive valid D-SACKs below that. 
+ * As long as they reside fully below undo_marker they do not affect + * behavior in anyway and can therefore be safely ignored. In rare cases + * (which are really theoretical ones), the D-SACK will nicely cross that + * boundary due to skb fragmentation and packet reordering past skb's + * retransmission. To consider them correctly, the acceptable range must + * be extended even more though the exact amount is rather hard to + * quantify (from the state TCP has?), so in boundary crossing case + * simply adjust start_seq to undo_marker. + * * Reordering detection. * * Reordering metric is maximal distance, which a packet can be displaced @@ -1243,9 +1264,28 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, struct sk_buff *skb; __u32 start_seq = ntohl(sp-start_seq); __u32 end_seq = ntohl(sp-end_seq); + u32 valid_range_start; int fack_count; state.dup_sack = (found_dup_sack (i == state.first_sack_index)); + /* SACK validation: start_seq == snd_una is non-sensical */ + valid_range_start = tp-snd_una + 1; + + if (state.dup_sack tp-undo_marker) { + valid_range_start = tp-undo_marker; + + /* Undo_marker boundary crossing, see comments above */ + if (before(start_seq, tp-undo_marker) + after(end_seq, tp-undo_marker)) + start_seq = tp-undo_marker; + } + + /* Discard SACK blocks that are seemingly bad */ + if (before(start_seq, valid_range_start) || + after(end_seq, tp-snd_nxt) || + !before(start_seq, end_seq)) + continue; + skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 0/2]: SACK block validation
Hi, SACK block validation to tcp-2.6 tree. Before preparing this, I rebased tcp-2.6 to net-2.6 (not 2.6.23) because of DSACK patch in net-2.6 (some conflicts will occur). I did some very basic testing with this. No TCPSACKDiscards occurred in it but it's kind of hard for me to fully test the whole internet without being a server end for most of it, maybe I should start abusing all port probers for that purpose by using them as sinks... :-) This patchset addresses the only issue I found while trying to address your earlier comments (found in here): http://marc.info/?l=linux-netdevm=118060110632768w=2 It's perfectly ok for timedout_continue to be NULL and it's being checked for except when the result of find for tp-highest_sack is accessed (that will be a valid skb because sacked_out 0). Actually I didn't fully understand what was the concern with timedout_continue rather than with skb... But this problem was located anyway, so the comment itself proved useful (and DSACK regression got fixed too due to it :-)). If it looks sane, you can consider applying to tcp-2.6 (rebasing is needed for that). I'm not entirely sure if I can add MIB stuff just like that (couldn't find any examples from the available history) though some info will definitely be useful when trying to figure out regression reports (due to flawed TCP implementation of the peer). Yes, it might cause some troubles when communicating with broken TCP implementations. Do you have some plans regarding tcp-2.6? FYI, I've also discovered that the current implementation of timedout_mark_forward is flawed because tcp_skb_timedout() boundary is advanced whenever retransmissions there occur but I probably don't have time to address that until mid-July or so. I think that the hint is not coming back though as TCP can use a binary search that finds TCBCB_TAGBITS boundary using the RB-tree. 
My other plans include, e.g., combining highest_sack(+fackets_out) and sack fastpath hints but that requires some benchmarking first to see how frequently they are different, I suspect it won't really happen that often, so most of the time we have three u32s and one skb ptr pointing the very same place (fackets stuff is not really pointing but related like you said earlier)... :-) -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[RFC PATCH tcp-2.6 2/2] [TCP] MIB: Add counter for discarded SACK blocks
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/snmp.h |1 + net/ipv4/proc.c |1 + net/ipv4/tcp_input.c |4 +++- 3 files changed, 5 insertions(+), 1 deletions(-) diff --git a/include/linux/snmp.h b/include/linux/snmp.h index 802b3a3..6e64a70 100644 --- a/include/linux/snmp.h +++ b/include/linux/snmp.h @@ -231,6 +231,7 @@ enum LINUX_MIB_TCPABORTONLINGER, /* TCPAbortOnLinger */ LINUX_MIB_TCPABORTFAILED, /* TCPAbortFailed */ LINUX_MIB_TCPMEMORYPRESSURES, /* TCPMemoryPressures */ + LINUX_MIB_TCPSACKDISCARD, /* TCPSACKDiscard */ __LINUX_MIB_MAX }; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3b690cf..d5a8de0 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -244,6 +244,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM(TCPAbortOnLinger, LINUX_MIB_TCPABORTONLINGER), SNMP_MIB_ITEM(TCPAbortFailed, LINUX_MIB_TCPABORTFAILED), SNMP_MIB_ITEM(TCPMemoryPressures, LINUX_MIB_TCPMEMORYPRESSURES), + SNMP_MIB_ITEM(TCPSACKDiscard, LINUX_MIB_TCPSACKDISCARD), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ebd9739..e26d8ca 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1283,8 +1283,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* Discard SACK blocks that are seemingly bad */ if (before(start_seq, valid_range_start) || after(end_seq, tp-snd_nxt) || - !before(start_seq, end_seq)) + !before(start_seq, end_seq)) { + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); continue; + } skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Fri, 15 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 16 Jun 2007 02:04:25 +0300 (EEST) On Fri, 15 Jun 2007, David Miller wrote: Patch applied. ...I think it should go to stable as well. I thought Baruch's DSACK separation that created this problem didn't go in until 2.6.22? Then my gitk lies... :-) It claims for the particular commit: Follows: v2.6.20 Preceeds: v2.6.21-rc1 $ git-describe 6f74651ae626ec672028587bc700538076dfbefb v2.6.20-835-g6f74651 ...It's definitely in 2.6.21 already. -- i.
[PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 12 include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 32 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..2b139f6 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -261,6 +261,18 @@ static inline s64 ktime_to_ns(const ktime_t kt) #endif +/** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + static inline s64 ktime_to_us(const ktime_t kt) { struct timeval tv = ktime_to_timeval(kt); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..6f0b2f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1579,6 +1579,10 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline ktime_t net_invalid_timestamp(void) +{ + return ktime_set(0, 0); +} extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 4adc47c..b2b2256 100644 --- a/net/ipv4/tcp_illinois.c +++ 
b/net/ipv4/tcp_illinois.c @@ -90,6 +90,9 @@ static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) ca-acked = pkts_acked; + if (ktime_equal(last, net_invalid_timestamp())) + return; + rtt = ktime_to_us(net_timedelta(last)); /* ignore bogus values, this prevents wraparound in alpha math */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6d0f9b..aaf6f66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2409,7 +2409,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - ktime_t last_ackt = ktime_set(0,0); + ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { @@ -2487,6 +2487,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + /* Is the ACK triggering packet unambiguous? */ + if (acked FLAG_RETRANS_DATA_ACKED) + last_ackt = net_invalid_timestamp(); + if (ca_ops-pkts_acked) ca_ops-pkts_acked(sk, pkts_acked, last_ackt); } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 43294ad..e49836c 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -266,7 +266,8 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (!ktime_equal(last, net_invalid_timestamp())) + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); /* calc inference */ if (tcp_time_stamp tp-rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 73e19cf..e218a51 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -117,6 +117,9 @@ void tcp_vegas_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; + if (ktime_equal(last, net_invalid_timestamp())) + return; + /* Never allow zero 
rtt or baseRTT */ vrtt = ktime_to_us(net_timedelta(last)) + 1; diff --git a/net/ipv4/tcp_veno.c b/net/ipv4/tcp_veno.c index 9edb340..ec854cc 100644 --- a/net/ipv4/tcp_veno.c +++ b/net/ipv4/tcp_veno.c @@ -74,6 +74,9 @@ static void tcp_veno_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) struct veno *veno = inet_csk_ca(sk); u32 vrtt; + if (ktime_equal(last, net_invalid_timestamp())) + return; + /* Never allow zero rtt or baseRTT */ vrtt = ktime_to_us
[PATCH tcp-2.6 0/3]: cleanups
Couple of relatively simple cleanups to tcp-2.6. -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 1/3] [TCP]: Add tcp_dec_pcount_approx int variant
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h | 11 --- net/ipv4/tcp_output.c |8 ++-- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 6f88d13..d12154a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -593,16 +593,21 @@ static inline int tcp_skb_mss(const struct sk_buff *skb) return skb_shinfo(skb)-gso_size; } -static inline void tcp_dec_pcount_approx(__u32 *count, -const struct sk_buff *skb) +static inline void tcp_dec_pcount_approx_int(__u32 *count, const int decr) { if (*count) { - *count -= tcp_skb_pcount(skb); + *count -= decr; if ((int)*count 0) *count = 0; } } +static inline void tcp_dec_pcount_approx(__u32 *count, +const struct sk_buff *skb) +{ + tcp_dec_pcount_approx_int(count, tcp_skb_pcount(skb)); +} + static inline void tcp_packets_out_inc(struct sock *sk, const struct sk_buff *skb) { diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 23ee283..c8be1c8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -738,15 +738,11 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (diff 0) { /* Adjust Reno SACK estimate. */ if (!tp-rx_opt.sack_ok) { - tp-sacked_out -= diff; - if ((int)tp-sacked_out 0) - tp-sacked_out = 0; + tcp_dec_pcount_approx_int(tp-sacked_out, diff); tcp_sync_left_out(tp); } - tp-fackets_out -= diff; - if ((int)tp-fackets_out 0) - tp-fackets_out = 0; + tcp_dec_pcount_approx_int(tp-fackets_out, diff); } } -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 2/3] [TCP]: Remove num_acked0 checks from cong.ctrl mods pkts_acked
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] There is no need for such check in pkts_acked because the callback is not invoked unless at least one segment got fully ACKed (i.e., the snd_una moved past skb's end_seq) by the cumulative ACK's snd_una advancement. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_bic.c |2 +- net/ipv4/tcp_cubic.c|2 +- net/ipv4/tcp_westwood.c |3 +-- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 281c9f9..9ebd8ec 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -210,7 +210,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { + if (icsk-icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 1422448..8be8ac4 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -338,7 +338,7 @@ static void bictcp_acked(struct sock *sk, u32 cnt, ktime_t last) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt 0 icsk-icsk_ca_state == TCP_CA_Open) { + if (icsk-icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca-delayed_ack ACK_RATIO_SHIFT; ca-delayed_ack += cnt; diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index e61e09d..0d56a5a 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -103,8 +103,7 @@ static void westwood_filter(struct westwood *w, u32 delta) static void tcp_westwood_pkts_acked(struct sock *sk, u32 cnt, ktime_t last) { struct westwood *w = inet_csk_ca(sk); - if (cnt 0) - w-rtt = tcp_sk(sk)-srtt 3; + w-rtt = tcp_sk(sk)-srtt 3; } /* -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH tcp-2.6 3/3] [TCP]: Move sack_ok access to obviously named funcs & cleanup
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Previously code had IsReno/IsFack defined as macros that were local to tcp_input.c though sack_ok field has users elsewhere too for the same purpose. This changes them to static inlines as preferred according the current coding style and unifies the access to sack_ok across multiple files. Magic bitops of sack_ok for FACK and DSACK are also abstracted to functions with appropriate names. Note: - One sack_ok = 1 remains but that's self explanary, i.e., it enables sack - Couple of !IsReno cases are changed to tcp_is_sack - There were no users for IsDSack = I dropped it Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/net/tcp.h| 28 net/ipv4/tcp.c |2 +- net/ipv4/tcp_input.c | 103 +- net/ipv4/tcp_minisocks.c |2 +- net/ipv4/tcp_output.c|6 +- net/ipv4/tcp_timer.c |2 +- 6 files changed, 90 insertions(+), 53 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index d12154a..dd9749e 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -715,6 +715,34 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) icsk-icsk_ca_ops-cwnd_event(sk, event); } +/* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. + * + * tcp_is_sack - SACK enabled + * tcp_is_reno - No SACK + * tcp_is_fack - FACK enabled, implies SACK enabled + */ +static inline int tcp_is_sack(const struct tcp_sock *tp) +{ + return tp-rx_opt.sack_ok; +} + +static inline int tcp_is_reno(const struct tcp_sock *tp) +{ + return !tcp_is_sack(tp); +} + +static inline int tcp_is_fack(const struct tcp_sock *tp) +{ + return tp-rx_opt.sack_ok 2; +} + +static inline void tcp_enable_fack(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok |= 2; +} + /* This determines how many packets are in the network to the best * of our knowledge. 
In many cases it is conservative, but where * detailed information is available from the receiver (via SACK diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd4c295..e3de919 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2000,7 +2000,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp-rx_opt.tstamp_ok) info-tcpi_options |= TCPI_OPT_TIMESTAMPS; - if (tp-rx_opt.sack_ok) + if (tcp_is_sack(tp)) info-tcpi_options |= TCPI_OPT_SACK; if (tp-rx_opt.wscale_ok) { info-tcpi_options |= TCPI_OPT_WSCALE; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f3942c3..3efd4f5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -108,11 +108,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) -#define IsReno(tp) ((tp)-rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)-rx_opt.sack_ok 2) -#define IsDSack(tp) ((tp)-rx_opt.sack_ok 4) -#define Is3517Sack(tp) (!IsReno(tp) !IsFack(tp)) - #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) @@ -840,6 +835,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok = ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp-rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. 
*/ static void tcp_init_metrics(struct sock *sk) @@ -861,7 +871,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) tp-reordering != dst_metric(dst, RTAX_REORDERING)) { - tp-rx_opt.sack_ok = ~2; + tcp_disable_fack(tp); tp-reordering = dst_metric(dst, RTAX_REORDERING); } @@ -923,9 +933,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -937,8 +947,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp-sacked_out
[PATCH] [TCP]: Fix logic breakage due to DSACK separation
Commit 6f74651ae626ec672028587bc700538076dfbefb is found guilty of breaking DSACK counting, which should be done only for the SACK block reported by the DSACK instead of every SACK block that is received along with DSACK information. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |9 + 1 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d6d0f9b..c6a363b 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -953,7 +953,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; - int dup_sack = 0; + int found_dup_sack = 0; int cached_fack_count; int i; int first_sack_index; @@ -964,20 +964,20 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ /* Check for D-SACK. */ if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)-ack_seq)) { - dup_sack = 1; + found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); } else if (num_sacks 1 !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { - dup_sack = 1; + found_dup_sack = 1; tp-rx_opt.sack_ok |= 4; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); } /* D-SACK for already forgotten data... * Do dumb counting. */ - if (dup_sack + if (found_dup_sack !after(ntohl(sp[0].end_seq), prior_snd_una) after(ntohl(sp[0].end_seq), tp-undo_marker)) tp-undo_retrans--; @@ -1058,6 +1058,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ __u32 start_seq = ntohl(sp-start_seq); __u32 end_seq = ntohl(sp-end_seq); int fack_count; + int dup_sack = (found_dup_sack (i == first_sack_index)); skb = cached_skb; fack_count = cached_fack_count; -- 1.5.0.6
Re: [PATCH] [TCP]: Fix logic breakage due to DSACK separation
On Fri, 15 Jun 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Fri, 15 Jun 2007 16:07:37 +0300 (EEST) Commit 6f74651ae626ec672028587bc700538076dfbefb is found guilty of breaking DSACK counting, which should be done only for the SACK block reported by the DSACK instead of every SACK block that is received along with DSACK information. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Thanks for performing such rigorous audits and finding regressions like this! They just come across, one thing leads to another, which again leads to another and so on... ...I was just trying to address the concerns you noted about tcp-2.6 patchset a while ago and came across other issues... There are still some things I must think carefully in sacktag processing since it does not validate start_seq and end_seq at all which can be abused currently at least in tcp-2.6. ...I would rather put end to the whole russian roulette in tcp-2.6 sacktag rather than fix/think individual cases and leave future modifications of it similarily hazardous. It's not very clear to me how to handle all possible cases of invalid SACK blocks though, perhaps TCP should just ignore such sack blocks without doing any processing based on them, i.e., ignore them whenever start_seq-end_seq does not fully fit to snd_una-snd_nxt (expect DSACK of course, which should be processed if it's between undo_marker-snd_nxt). Do you have any thoughts about this? It seems to me that net-2.6 is safe in this respect (probably just by accident) but the analysis wasn't that trivial, my main concern was tcp_fragment's pkt_len argument, if it ever becomes skb-len, a boom would result (and perhaps zero has similar issues)! I couldn't find breakage analytically (but I could be wrong in it due to mind-boggling number of negations :-)) nor by bruteforcing seqno combinations near 0, skb-len and 2^31 wrap. Patch applied. ...I think it should go to stable as well. -- i.
[PATCH] [TCP]: Add missing break to TCP option parsing code
This flaw does not affect any behavior (currently). Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |1 + 1 files changed, 1 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index d506bdc..aaf6f66 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2936,6 +2936,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, opt_rx-sack_ok) { TCP_SKB_CB(skb)-sacked = (ptr - 2) - (unsigned char *)th; } + break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* -- 1.5.0.6
[PATCH net-2.6 1/1] [TCP]: Fix left_out setting during FRTO
Without FRTO, the tcp_try_to_open is never called with lost_out 0 (see tcp_time_to_recover). However, when FRTO is enabled, the !tp-lost condition is not used until end of FRTO because that way TCP avoids premature entry to fast recovery during FRTO. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- This case was found during left_out drop audit (only relevant to net-2.6 since tcp-2.6 does a right thing after left_out drop). diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 74683d8..ed4a1bd 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2037,7 +2037,7 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp-left_out = tp-sacked_out; + tcp_sync_left_out(tp); if (tp-retrans_out == 0) tp-retrans_stamp = 0; -- 1.5.0.6
[RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix
I was thinking something like this to fix the cc module breakage introduced by the API change (haven't tested it besides compile): [RFC PATCH net-2.6] [TCP]: Congestion control API RTT sampling fix Commit 164891aadf1721fca4dce473bb0e0998181537c6 broke RTT sampling of congestion control modules. Inaccurate timestamps could be fed to them without providing any way for them to identify such cases. Previously RTT sampler was called only if FLAG_RETRANS_DATA_ACKED was not set filtering inaccurate timestamps nicely. In addition, the new behavior could give an invalid timestamp (zero) to RTT sampler if only skbs with TCPCB_RETRANS were ACKed. This solves both problems. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/ktime.h | 18 ++ include/linux/skbuff.h |4 net/ipv4/tcp_illinois.c |3 +++ net/ipv4/tcp_input.c|6 +- net/ipv4/tcp_lp.c |3 ++- net/ipv4/tcp_vegas.c|3 +++ net/ipv4/tcp_veno.c |3 +++ 7 files changed, 38 insertions(+), 2 deletions(-) diff --git a/include/linux/ktime.h b/include/linux/ktime.h index c762954..9f7fa3e 100644 --- a/include/linux/ktime.h +++ b/include/linux/ktime.h @@ -102,6 +102,12 @@ static inline ktime_t ktime_set(const long secs, const unsigned long nsecs) #define ktime_add_ns(kt, nsval) \ ({ (ktime_t){ .tv64 = (kt).tv64 + (nsval) }; }) +/* Compare two ktime_t variables, returns 1 if equal */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return cmp1.tv64 == cmp2.tv64; +} + /* convert a timespec to ktime_t format: */ static inline ktime_t timespec_to_ktime(struct timespec ts) { @@ -200,6 +206,18 @@ static inline ktime_t ktime_add(const ktime_t add1, const ktime_t add2) extern ktime_t ktime_add_ns(const ktime_t kt, u64 nsec); /** + * ktime_equal - Compares two ktime_t variables to see if they are equal + * @cmp1: comparable1 + * @cmp2: comparable2 + * + * Compare two ktime_t variables, returns 1 if equal + */ +static inline int ktime_equal(const ktime_t cmp1, const ktime_t cmp2) +{ + return !((cmp1.tv.sec ^ 
cmp2.tv.sec) | (cmp1.tv.usec ^ cmp2.tv.usec)); +} + +/** * timespec_to_ktime - convert a timespec to ktime_t format * @ts:the timespec variable to convert * diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e7367c7..6f0b2f7 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1579,6 +1579,10 @@ static inline ktime_t net_timedelta(ktime_t t) return ktime_sub(ktime_get_real(), t); } +static inline ktime_t net_invalid_timestamp(void) +{ + return ktime_set(0, 0); +} extern __sum16 __skb_checksum_complete_head(struct sk_buff *skb, int len); extern __sum16 __skb_checksum_complete(struct sk_buff *skb); diff --git a/net/ipv4/tcp_illinois.c b/net/ipv4/tcp_illinois.c index 4adc47c..5f8d01b 100644 --- a/net/ipv4/tcp_illinois.c +++ b/net/ipv4/tcp_illinois.c @@ -90,6 +90,9 @@ static void tcp_illinois_acked(struct sock *sk, u32 pkts_acked, ktime_t last) ca-acked = pkts_acked; + if (ktime_equal(last, net_invalid_timestamp()) + return; + rtt = ktime_to_us(net_timedelta(last)); /* ignore bogus values, this prevents wraparound in alpha math */ diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index ed4a1bd..d506bdc 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2409,7 +2409,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) int acked = 0; int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - ktime_t last_ackt = ktime_set(0,0); + ktime_t last_ackt = net_invalid_timestamp(); while ((skb = tcp_write_queue_head(sk)) skb != tcp_send_head(sk)) { @@ -2487,6 +2487,10 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); + /* Is the ACK triggering packet unambiguous? 
*/ + if (acked FLAG_RETRANS_DATA_ACKED) + last_ackt = net_invalid_timestamp(); + if (ca_ops-pkts_acked) ca_ops-pkts_acked(sk, pkts_acked, last_ackt); } diff --git a/net/ipv4/tcp_lp.c b/net/ipv4/tcp_lp.c index 43294ad..efa358b 100644 --- a/net/ipv4/tcp_lp.c +++ b/net/ipv4/tcp_lp.c @@ -266,7 +266,8 @@ static void tcp_lp_pkts_acked(struct sock *sk, u32 num_acked, ktime_t last) struct tcp_sock *tp = tcp_sk(sk); struct lp *lp = inet_csk_ca(sk); - tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); + if (!ktime_equal(last, net_invalid_timestamp()) + tcp_lp_rtt_sample(sk, ktime_to_us(net_timedelta(last))); /* calc inference */ if (tcp_time_stamp tp-rx_opt.rcv_tsecr) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 73e19cf..bd7a08f 100644 --- a/net/ipv4
Re: Suspicious fackets_out handling
On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Mon, 23 Apr 2007 14:28:21 +0300 (EEST) There are IMHO two problems in it. First of all, nothing ensures that the skb TCP is fragmenting is actually below the forwardmost sack block (and thus is included to the fackets_out)... ...this is so old thing already that I had to refresh my memory, haven't been expecting any answer for ages... :-) Good catch, I agree with your analysis completely. ...I later on understood (from a comment I found elsewhere) that fackets_out is occasionally estimated, rather than exact value (which could be completely fixed in tcp-2.6 due to presence of highest_sack but I'm also considering complete removal of it too, like you probably noticed). What I'm not sure of though, is how to fix this in net-2.6(.22), it is due to the fact that there is no pointer/seq which can be used in testing for it like in tcp-2.6 which has the highest_sack. We can add highest_sack to 2.6.22 in order to fix a bug like this, if necessary. ...I think we can leave it as estimate for now, it has been like that for years and nobody has complained... :-) This problem covers really marginal number of cases anyway I think (isn't the diff usually negative: old - new after fragment should be negative unless mss_now bites). After I found the comments and analyzed some sacked_out code with NewReno, I think that usually these estimates are indicated in the code with tcp_dec_pcount_approx(...) but it wasn't used in this specific case because it inputs skb instead of int, I'll add clarification of this into my tcp-2.6 todo list... Second problem is even more obvious: if adjustment here is being done and the sacktag code then uses fastpath at the arrival of the next ACK, the sacktag code will use a stale value from fastpath_cnt_hint and fails to notice all that math TCP did here unless we start clearing fastpath_skb_hint. 
Let's try not to clear fastpath_skb_hint here if possible :-) Np, however, whatever we do, it wouldn't really execute that often... :-) Is it possible to update fastpath_cnt_hint properly perhaps? I think that would be valid and even accurate as it can checks skb's seq against fastpath_skb_hint-seq. I'm in a hurry and will be a week out of reach of internet connectivity but here's quick attempt to deal with this cleanly. Compile tested (be extra careful with this one, it's done i a hurry :-)), consider to net-2.6. Considering the marginality of this issue, stable might really be an overkill for this one, only a remotely valid concern comes to my mind in this case: possibility of fackets_out packet_out might not be dealt that cleanly everywhere (but I'm sure that you can come to a good decicion about it): [PATCH] [TCP]: SACK fastpath did override adjusted fackets_out Do same adjustment to SACK fastpath counters provided that they're valid. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_output.c |9 + 1 files changed, 9 insertions(+), 0 deletions(-) diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 53232dd..f24dd36 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -699,6 +699,15 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss tp-fackets_out -= diff; if ((int)tp-fackets_out 0) tp-fackets_out = 0; + /* SACK fastpath might overwrite it unless dealt with */ + if (tp-fastpath_skb_hint != NULL + after(TCP_SKB_CB(tp-fastpath_skb_hint)-seq, + TCP_SKB_CB(skb)-seq)) { + tp-fastpath_cnt_hint -= diff; + if ((int)tp-fastpath_cnt_hint 0) + tp-fastpath_cnt_hint = 0; + + } } } -- 1.5.0.6
Re: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules)
On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Wed, 30 May 2007 12:10:06 +0300 (EEST) [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. There is no need to handle SYN case specially because congestion control modules are not yet initialized when FLAG_SYN_ACKED is set. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] Patch applied, thanks a lot! ...It's good to be on the alert with this one because it might uncover some hidden bugs from congestion control module code (it changes some behavior so radically), which obviously will have to be then fixed too... -- i.
Re: [PATCH 3/9] [TCP]: Tighten tcp_sock's belt, drop left_out
Thanks for you comments... On Thu, 31 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:35:56 +0300 It is easily calculable when needed and user are not that many after all. This looks good, but are you absolutely sure we never used a stale value of tp-left_out on purpose? I tried to audit this but there are so many cases :-) Yes I know... So often complexity in these kind of things really perplexes me too, in a way this is somewhat similar to the case with FRTO and no SACKed bits touched (which just looked very obvious and safe) but would have caused the dreaded S+L situation unless special handling for IsReno(tp) case would have been added. What I mean here is, was there a case where we updated sacked_out or lost_out, but then used left_out before it was updated, and we were doing this on purpose? I understand what you mean even though I think that such approach would be very fragile if being done (and should be fixed). I was actually wondering earlier that could it be the only reason for the separate variable in the first place. Though so far I don't know any cases... ...but the code ain't always that trivial like you say :-), however, this after proven safe, will make it cleaner and will save next one to come from the same wondering... ;-) Anyway, I'll be try to do some grep tricks in order to reduce the number of cases that need an expensive manual audit to a smaller set where tcp_sync_left_out is not right after the adjustment. As plenty of functions end to tcp_sync_left_out without returning in between, it could help a lot, of course the calls within that function could have issues too but they're relatively easy to check then (common pattern is a trivial loop + sync as you know)... Usually only when the stale state leaks from a function, we're in troubles with the number of possible paths. 
...I'll come back to this later on after verifying it and perhaps provide the same shortened list for you too so that you can see if I overlooked something... It might actually end up being easier to audit all tcp_packets_in_flight callers instead to see if there's always sync_left_out before them. Another thing, I was thinking of renaming tcp_sync_left_out to tcp_verify_left_out now that it does only BUG traps... Do you have a hunch about this (from patch 8 msg): I wonder if we could get that BUG_ON place pinpointed more exactly in oops because now inlining makes it lose its original context as they always seem to be in tcp_ack, #define helps? (i.e, how does it show up in oops if tcp_verify_left_out looks like this): #define tcp_verify_left_out(tp) BUG_ON(...) ...does it still point to the tcp.h line then or to the tcp_sync_left_out(tp) @ line in tcp_input.c? ...I'll test that later on by myself in case you don't know the answer. -- i.
Re: [PATCH-2.4] Fix divide by 0 in vegas_cong_avoid()
On Tue, 29 May 2007, Stephen Hemminger wrote: On Tue, 29 May 2007 20:23:45 +0200 Lior Dotan [EMAIL PROTECTED] wrote: NTP was not running. I'm not sure what do you mean by fixing the -1. The trace shows that vegas_cong_avoid() is called with -1, and the only way it can happen is from tcp_clean_rtx_queue() and the patch should eliminate this. Another way of solving this is by checking vegas_rtt_calc() and see if it gets -1 and handle it there. Another thing that I don't understand is that some places like tcp_ack() declare seq_rtt as signed and some vegas_cong_avoid() declare it as unsigned. Shouldn't it be declared always as signed? On 5/29/07, Stephen Hemminger [EMAIL PROTECTED] wrote: On Tue, 29 May 2007 12:18:19 +0300 Lior Dotan [EMAIL PROTECTED] wrote: Hi, I had a divide by zero on kernel 2.4.33 running with Vegas enabled. I don't think anyone has backported the other vegas fixes from 2.6.21 to 2.4. For 2.6.22, rtt_calc doesn't exist, instead pkts_acked is called. I tried to figure this one out yesterday, and it seems to me that divide by zero should not occur with 2.6.22 (nor with 2.6.21 code), no matter how I try to approach vegas... The following should be added to avoid getting bogus timestamp values from retransmits. ...but this is unreliable timestamps problem is a real one that originates from API merge commit 164891aadf1721fca4dce473bb0e0998181537c6 of yours (see some though about it below). I suppose there are now similar concerns about timestamp validity in other cc modules than vegas too. 
--- a/net/ipv4/tcp_vegas.c2007-05-02 12:26:35.0 -0700 +++ b/net/ipv4/tcp_vegas.c2007-05-29 14:06:26.0 -0700 @@ -117,6 +117,10 @@ void tcp_vegas_pkts_acked(struct sock *s struct vegas *vegas = inet_csk_ca(sk); u32 vrtt; + /* Ignore retransmits */ + if (unlikely(cnt == 0)) + return; + /* Never allow zero rtt or baseRTT */ vrtt = ktime_to_us(net_timedelta(last)) + 1; ...I don't think this works, because cnt won't be zero ever because to make the call some skbs were cleaned from the queue (isn't that what pkts_acked stands for?). There is as if (ackedFLAG_ACKED) guard before making the pkts_acked call to cc modules, thus delta in packets_out will always be greater than 0. Hmm, now that I realize this, I would say that checks for 0 in pkts_acked are entirely bogus (in the current code), hmm, that means I have more things to cleanup in tcp-2.6, at least, bic, cubic and westwood do them... :-). I think the code did a right thing before your api merge, since it called rtt callback only if FLAG_RETRANS_DATA_ACKED was not set (and pkts_acked always), i.e., when TCP cannot know if it was the rexmission that triggered the cumulative ACK or any original transmission, no new RTT measurement should be made. Consider also the fact that, there might be a non rexmitted skb after rexmitted one, which Lior's patch doesn't do right way at all since the FLAG_DATA_ACKED would again get set (no div-by-zero would have occured then but an unreliable timestamp would have been givento vegas in 2.4). The number of packets that got cumulatively acked is never a right metric to measure whether the cumulative ACK originated from rexmitted skbs or not. ...Perhaps the FLAG_RETRANS_DATA_ACKED flag needs to be passed somehow to cc modules? -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules)
On Tue, 29 May 2007, Ilpo Järvinen wrote: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. Pkts_acked callback of congestion control modules include SYN as a packet for now on. Based on feedback from Stephen, I changed the commit msg bit clearer, the patch remains the same. [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. There is no need to handle SYN case specially because congestion control modules are not yet initialized when FLAG_SYN_ACKED is set. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..74683d8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,6 +2480,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = 
inet_csk(sk)-icsk_ca_ops; -- 1.5.0.6
Re: [PATCH-2.4] Fix divide by 0 in vegas_cong_avoid()
On Wed, 30 May 2007, Stephen Hemminger wrote: On Wed, 30 May 2007 10:49:54 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: I think the code did a right thing before your api merge, since it called rtt callback only if FLAG_RETRANS_DATA_ACKED was not set (and pkts_acked always), i.e., when TCP cannot know if it was the rexmission that triggered the cumulative ACK or any original transmission, no new RTT measurement should be made. Consider also the fact that, there might be a non rexmitted skb after rexmitted one, which Lior's patch doesn't do right way at all since the FLAG_DATA_ACKED would again get set (no div-by-zero would have occured then but an unreliable timestamp would have been givento vegas in 2.4). The number of packets that got cumulatively acked is never a right metric to measure whether the cumulative ACK originated from rexmitted skbs or not. ...Perhaps the FLAG_RETRANS_DATA_ACKED flag needs to be passed somehow to cc modules? What about the mixed case where some retransmitted data and new data was covered by one ack. Without timestamps TCP cannot do anything better than consider a cumulative ACK that includes any ever retransmitted segment as inaccurate timestamp. Like in tcp_ack_no_tstamp: if (flag FLAG_RETRANS_DATA_ACKED) return; ...I think this same applies to cc modules as well. With timestamps things are another beast. I would rather keep the merge, but think about the cases more carefully. IMHO there's no problem with keeping the merge, you just need to add another parameter to pkts_acked (or by some other means indicate it) to distinguish RTT measurement with accurate timestamp from inaccurate ones. Maybe the first step (to 2.6.22 to avoid new bugs etc.) should be just to give the flag (for some funny reason, it's called acked in tcp_clean_rtx_queue) or use a single arg with 0/1 value fo depending on FLAG_RETRANS_DATA_ACKED and check that in the merged pkts_acked in the cases that were using the old rtt callback. 
It would allow retaining the previous (correct) functionality as it was before the API merge. That would prevent potential regressions in cc modules due to this merge. ...Then perhaps come up with a solution that takes advantage of timestamps more aggressively to 2.6.23 like the core rtt estimator already does. -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Tue, 29 May 2007, Stephen Hemminger wrote: On Mon, 28 May 2007 13:27:03 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: On Sun, 27 May 2007, Ilpo Järvinen wrote: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) Yes, thanks for fixing this. Wonder how it affects measurements. ...It's a bit hard to tell since dynamics change so dramatically in 0 check cases, the resulting behavior in too small value cases may be easier to predict... It's possible that this could explain some anomalities you've been seeing in your measurements. It is not very clear how SYN segments should be handled, so I choose to follow the previous implementation in this respect. Since we don't invoke congestion control modules until after the SYN handshake this is not a problem. Just curious, do you mean that cc modules cannot measure, e.g., initial RTT through this mechanism (though they could do that in init() cb then I suppose)... Or do you mean that they are called already for the ACK that completes the SYN handshake and therefore its skb is being cleaned from the queue right now (this is the case I above refer to)? In the first case the decrementer code is NOP. If the latter, then it is just interface specification question, i.e., if SYNs are treated as zero or one in num_acked for the pkts_acked callback (I have no opinion on this but was just trying to make sure cc modules get what they expect :-)). -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Tue, 29 May 2007, Stephen Hemminger wrote: On Tue, 29 May 2007 23:07:00 +0300 (EEST) Ilpo Järvinen [EMAIL PROTECTED] wrote: On Tue, 29 May 2007, Stephen Hemminger wrote: Since we don't invoke congestion control modules until after the SYN handshake this is not a problem. Just curious, do you mean that cc modules cannot measure, e.g., initial RTT through this mechanism (though they could do that in init() cb then I suppose)... Or do you mean that they are called already for the ACK that completes the SYN handshake and therefore its skb is being cleaned from the queue right now (this is the case I above refer to)? In the first case the decrementer code is NOP. If the latter, then it is just interface specification question, i.e., if SYNs are treated as zero or one in num_acked for the pkts_acked callback (I have no opinion on this but was just trying to make sure cc modules get what they expect :-)). We don't switch a socket out of Reno until after the initial handshake. ...It's still not very clear to me what exactly your after means (both here and in your earlier description), i.e., whether clean_rtx_queue call that cleans SYN skb from the queue happens before or after the switch out of reno or not... If I understand the code correctly, this specific clean_rtx_queue call happens after your after but I could be misunderstanding the current 3-way handshake code. :-) As an interface, it makes sense to keep the API with the SYN counting as a packet. Ok, this one answers the remaining question concerning the patch, here is it without the decrementer for SYN case (which IMHO wasn't very beautiful looking anyway :-)). Dave, please consider this to net-2.6. It could be a stable candidate as well, haven't tested yet if it applies cleanly to stable: [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. 
In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. Pkts_acked callback of congestion control modules include SYN as a packet for now on. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |4 ++-- 1 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..74683d8 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,6 +2480,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)-icsk_ca_ops; -- 1.5.0.6
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, Ilpo Järvinen wrote: On Sun, 27 May 2007, Baruch Even wrote: * Ilpo J?rvinen [EMAIL PROTECTED] [070527 14:16]: Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intented to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... At least the htcp module that I wrote assumes that the number is actual number of tcp packets so GSO should be considered. Thanks for the info! It is what I suspected... ...I'll write a patch for it tomorrow against net-2.6... Dave, beware that it will partially overlap with the changes made in the patch 8, so you might choose to put the patch 8 on hold until this issue is first resolved... The consequences of this bug are not too large but it does make all congestion control algorithms a lot less aggressive. On my machines GSO is disabled by default (e1000 at 100mbps Tigon3 @ 1Gbps). Agreed, that's my impression too. However, some algorithms do things like 0 checks for it, so it might disturb their dynamics even more than in the too small value cases... Hmm, there seems to be another case that I'm not too sure of... Please check the alternative I choose for SYN handling below... ...hmm... While exploring this SYN thingie, I noticed that commit 164891aadf1721fca4dce473bb0e0998181537c6 drops !FLAG_RETRANS_DATA_ACKED check from rtt_sample call (when combining it with pkts_acked call). I hope that's intentional?!? ...the commit message didn't say anything about it nor was anything in cc modules changed to accomodate that. [PATCH] [TCP]: Fix GSO ignorance of pkts_acked arg (cong.cntrl modules) The code used to ignore GSO completely, passing either way too small or zero pkts_acked when GSO skb or part of it got ACKed. 
In addition, there is no need to calculate the value in the loop but simple arithmetics after the loop is sufficient. It is not very clear how SYN segments should be handled, so I choose to follow the previous implementation in this respect. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c |7 +-- 1 files changed, 5 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 38cb25b..7dfaabb 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2407,8 +2407,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) struct sk_buff *skb; __u32 now = tcp_time_stamp; int acked = 0; + int prior_packets = tp-packets_out; __s32 seq_rtt = -1; - u32 pkts_acked = 0; ktime_t last_ackt = ktime_set(0,0); while ((skb = tcp_write_queue_head(sk)) @@ -2437,7 +2437,6 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) */ if (!(scb-flags TCPCB_FLAG_SYN)) { acked |= FLAG_DATA_ACKED; - ++pkts_acked; } else { acked |= FLAG_SYN_ACKED; tp-retrans_stamp = 0; @@ -2481,9 +2480,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } if (ackedFLAG_ACKED) { + u32 pkts_acked = prior_packets - tp-packets_out; const struct tcp_congestion_ops *ca_ops = inet_csk(sk)-icsk_ca_ops; + if (acked FLAG_SYN_ACKED) + pkts_acked--; + tcp_ack_update_rtt(sk, acked, seq_rtt); tcp_ack_packets_out(sk); -- 1.5.0.6
Re: [PATCH 6/9] [TCP]: Reorganize lost marking code
On Sat, 26 May 2007, Ilpo Järvinen wrote: -static void tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, -int fast_rexmit) +struct sk_buff *tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, +int fast_rexmit) Noticed that the static got dropped here unintentionally, 2nd try included below. [PATCH 6/9] [TCP]: Reorganize lost marking code The indentation started to get scary, so I reorganized code so that some trivial ifs are in tcp_update_scoreboard and the main loops remain in tcp_update_scoreboard_fack. It's much easier to view the actual changes with git-diff -w than from the messy looking (default) diff. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 157 +- 1 files changed, 79 insertions(+), 78 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 452a3b0..c5eda37 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1866,7 +1866,6 @@ static void tcp_mark_head_lost_single(struct sock *sk) before(tp-snd_una, tp-high_seq)) { TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; tp-lost_out += tcp_skb_pcount(skb); - tcp_sync_left_out(tp); } } @@ -1905,99 +1904,85 @@ static void tcp_mark_head_lost_single(struct sock *sk) * Key difference here is that FACK uses both SACK blocks and holes while * RFC3517 is using only SACK blocks when counting for reordering. 
*/ -static void tcp_update_scoreboard_fack(struct sock *sk, u32 entry_seq, - int fast_rexmit) +static struct sk_buff *tcp_update_scoreboard_fack(struct sock *sk, + u32 entry_seq, + int fast_rexmit) { struct tcp_sock *tp = tcp_sk(sk); /* Beware: timedout_continue might be pointing to sk_write_queue */ struct sk_buff *timedout_continue = NULL; struct sk_buff *skb; - - if (!tp-sacked_out) { - if (IsFack(tp) || fast_rexmit) { - tcp_mark_head_lost_single(sk); - skb = tcp_write_queue_head(sk); - timedout_continue = tcp_write_queue_next(sk, skb); - } - + unsigned int holes_seen = 0; + + skb = tcp_write_queue_find(sk, entry_seq); + /* If this ever becomes expensive, it can be delayed */ + timedout_continue = tcp_write_queue_next(sk, skb); + if (entry_seq != tp-highest_sack) { + /* Not interested in the last SACKed one we got */ + /* RFC: find_below could help here too */ + skb = tcp_write_queue_prev(sk, skb); + if (IsFack(tp) tcp_skb_timedout(sk, skb) + (tp-fackets_out tp-packets_out)) { + timedout_continue = tcp_write_queue_find(sk, tp-highest_sack); + timedout_continue = tcp_write_queue_next(sk, timedout_continue); + } else + timedout_continue = NULL; } else { - unsigned int holes_seen = 0; - - skb = tcp_write_queue_find(sk, entry_seq); - /* If this ever becomes expensive, it can be delayed */ - timedout_continue = tcp_write_queue_next(sk, skb); - if (entry_seq != tp-highest_sack) { - /* Not interested in the last SACKed one we got */ - /* RFC: find_below could help here too */ - skb = tcp_write_queue_prev(sk, skb); - if (IsFack(tp) tcp_skb_timedout(sk, skb) - (tp-fackets_out tp-packets_out)) { - timedout_continue = tcp_write_queue_find(sk, tp-highest_sack); - timedout_continue = tcp_write_queue_next(sk, timedout_continue); - } else - timedout_continue = NULL; - } else { - unsigned int reord_count = IsFack(tp) ? 
0 : 1; - - /* Phase I: Search until TCP can mark */ - tcp_for_write_queue_backwards_from(skb, sk) { - if ((tp-fackets_out = tp-sacked_out + - tp-lost_out + - holes_seen) || - (TCP_SKB_CB(skb)-sacked TCPCB_LOST)) - goto backwards_walk_done; - - if (IsFack(tp) tcp_skb_timedout(sk, skb)) - break; - else - timedout_continue = NULL; - - if (IsFack(tp
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sat, 26 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sat, 26 May 2007 11:35:53 +0300 Dave, you could consider applying other than the last one if they seem ok to you too (you'll need to rebase your tcp-2.6 in that case first to apply cleanly those that touch tcp_sync_left_out :-)). Absolutely, I'll do the quick rebase of tcp-2.6 and review + applying of your patches (except the last one) over the weekend. While you're in the right context (reviewing patch 8), you could also look if tcp_clean_rtx_queue does a right thing when passing a strange pkts_acked to congestion control modules. I wonder if it really should ignore GSO the way it does currently... I read some cc module code and some was adding it to snd_cwnd_cnt, etc. which is a strong indication that GSO should be considered... Also if the head is GSO skb that is not completely acked, the loop breaks with pkts_acked being zero, I doubt that can be correct... ...added Stephen H. and Herbert X. to this thread since also they might have some idea about it. ...Anyway, if it's wrong, the fix should go to net-2.6 rather than to tcp-2.6... -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, David Miller wrote: From: Ilpo_Järvinen [EMAIL PROTECTED] Date: Sun, 27 May 2007 10:58:27 +0300 (EEST) While you're in the right context (reviewing patch 8), you could also look if tcp_clean_rtx_queue does a right thing when passing a strange pkts_acked to congestion control modules. I wonder if it really should ignore GSO the way it does currently... I read some cc module code and some was adding it to snd_cwnd_cnt, etc. which is a strong indication that GSO should be considered... Also if the head is GSO skb that is not completely acked, the loop breaks with pkts_acked being zero, I doubt that can be correct... [...snip...] will likely take a look at these issues wrt. patch 8 tomorrow. ...I hope I got myself understood correctly (with my non-native English :-))... ...My intention was to say that there might be a bug in tcp_clean_rtx_queue too, which on logical level is unrelated to the patch 8 itself and to the change it makes (but caught my attention while I was doing that patch as it resides in the same function). ...There could have been confusion since I'm too changing things in the same function and because my article usage is usually far from perfect... :-) Also the fastretrans_alert arg rename to the same name (i.e., pkts_acked) in patch 8 is unrelated to the potential bug. Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intended to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... -- i.
Re: [PATCH 0/9]: tcp-2.6 patchset
On Sun, 27 May 2007, Baruch Even wrote: * Ilpo Järvinen [EMAIL PROTECTED] [070527 14:16]: Thus, my original question basically culminates in this: should cc modules be passed number of packets acked or number of skbs acked? ...The latter makes no sense to me unless the value is intended to be interpreted as number of timestamps acked or something along those lines. ...I briefly tried looking up for documentation for cc module interface but didn't find anything useful about this, and thus asked in the first place... At least the htcp module that I wrote assumes that the number is actual number of tcp packets so GSO should be considered. Thanks for the info! It is what I suspected... ...I'll write a patch for it tomorrow against net-2.6... Dave, beware that it will partially overlap with the changes made in the patch 8, so you might choose to put the patch 8 on hold until this issue is first resolved... The consequences of this bug are not too large but it does make all congestion control algorithms a lot less aggressive. On my machines GSO is disabled by default (e1000 at 100mbps Tigon3 @ 1Gbps). Agreed, that's my impression too. However, some algorithms do things like 0 checks for it, so it might disturb their dynamics even more than in the too small value cases... -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/9]: tcp-2.6 patchset
Hi, Here are some changes to TCP I've been baking. Before doing this patchset, I rebased tcp-2.6 branch to the current net-2.6 (goes almost cleanly) because there are some dependencies to the TCP work in there. I booted these today and no very obvious problems showed up (OOPSes, BUG()s, reported scoreboard leaks, etc.) while I did some transfers with varying sysctl settings: SACK/FACK on/off and FRTO enabled. However, no in depth analysis on behavior has been made. The last patch is highly experimental (and incomplete), it should NOT be applied yet. I suspect that there are at least off-by-MSS things in it. I'm asking for your comments. Dave, you could consider applying other than the last one if they seem ok to you too (you'll need to rebase your tcp-2.6 in that case first to apply cleanly those that touch tcp_sync_left_out :-)). -- i. - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/9] [TCP] FRTO: remove unnecessary fackets/sacked_out recounting
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] F-RTO does not touch SACKED_ACKED bits at all, so there is no need to recount them in tcp_enter_frto_loss. After removal of the else branch, nested ifs can be combined. This must also reset sacked_out when SACK is not in use as TCP could have received some duplicate ACKs prior RTO. To achieve that in a sane manner, tcp_reset_reno_sack was re-placed by the previous patch. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 23 +++ 1 files changed, 7 insertions(+), 16 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 925a26f..4dab5d7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1479,17 +1479,15 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; - tp-sacked_out = 0; tp-lost_out = 0; - tp-fackets_out = 0; tp-retrans_out = 0; + if (IsReno(tp)) + tcp_reset_reno_sack(tp); tcp_for_write_queue(skb, sk) { if (skb == tcp_send_head(sk)) break; - cnt += tcp_skb_pcount(skb); /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... 
@@ -1501,19 +1499,12 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) } else { TCP_SKB_CB(skb)-sacked = ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } - if (!(TCP_SKB_CB(skb)-sackedTCPCB_SACKED_ACKED)) { - /* Do not mark those segments lost that were -* forward transmitted after RTO -*/ - if (!after(TCP_SKB_CB(skb)-end_seq, - tp-frto_highmark)) { - TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; - tp-lost_out += tcp_skb_pcount(skb); - } - } else { - tp-sacked_out += tcp_skb_pcount(skb); - tp-fackets_out = cnt; + /* Don't lost mark skbs that were fwd transmitted after RTO */ + if (!(TCP_SKB_CB(skb)-sackedTCPCB_SACKED_ACKED) + !after(TCP_SKB_CB(skb)-end_seq, tp-frto_highmark)) { + TCP_SKB_CB(skb)-sacked |= TCPCB_LOST; + tp-lost_out += tcp_skb_pcount(skb); } } tcp_sync_left_out(tp); -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 3/9] [TCP]: Tighten tcp_sock's belt, drop left_out
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] It is easily calculable when needed and user are not that many after all. Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- include/linux/tcp.h |1 - include/net/tcp.h|4 ++-- net/ipv4/tcp_input.c | 11 ++- net/ipv4/tcp_minisocks.c |1 - net/ipv4/tcp_output.c| 12 +++- 5 files changed, 7 insertions(+), 22 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 728321f..63e04f4 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -305,7 +305,6 @@ struct tcp_sock { u32 rtt_seq;/* sequence number to update rttvar */ u32 packets_out;/* Packets which are in flight*/ - u32 left_out; /* Packets which leaved network */ u32 retrans_out;/* Retransmitted packets out*/ /* * Options received (usually on last packet, some only on SYN packets). diff --git a/include/net/tcp.h b/include/net/tcp.h index dd58cf2..5d866a4 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -716,7 +716,8 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { - return (tp-packets_out - tp-left_out + tp-retrans_out); + return tp-packets_out - (tp-sacked_out + tp-lost_out) + + tp-retrans_out; } /* If cwnd ssthresh, we may raise ssthresh to be half-way to cwnd. 
@@ -738,7 +739,6 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) { BUG_ON(tp-rx_opt.sack_ok (tp-sacked_out + tp-lost_out tp-packets_out)); - tp-left_out = tp-sacked_out + tp-lost_out; } extern void tcp_enter_cwr(struct sock *sk, const int set_ssthresh); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4dab5d7..c17e077 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1301,8 +1301,6 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, } } - tp-left_out = tp-sacked_out + tp-lost_out; - if ((state.reord tp-fackets_out) icsk-icsk_ca_state != TCP_CA_Loss (!tp-frto_highmark || after(tp-snd_una, tp-frto_highmark))) tcp_update_reordering(sk, ((tp-fackets_out + 1) - state.reord), 0); @@ -1363,7 +1361,6 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked) static inline void tcp_reset_reno_sack(struct tcp_sock *tp) { tp-sacked_out = 0; - tp-left_out = tp-lost_out; } int tcp_use_frto(struct sock *sk) @@ -1526,7 +1523,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) void tcp_clear_retrans(struct tcp_sock *tp) { - tp-left_out = 0; tp-retrans_out = 0; tp-fackets_out = 0; @@ -2023,7 +2019,7 @@ static void DBGUNDO(struct sock *sk, const char *msg) printk(KERN_DEBUG Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n, msg, NIPQUAD(inet-daddr), ntohs(inet-dport), - tp-snd_cwnd, tp-left_out, + tp-snd_cwnd, tp-sacked_out + tp-lost_out, tp-snd_ssthresh, tp-prior_ssthresh, tp-packets_out); } @@ -2152,7 +2148,6 @@ static int tcp_try_undo_loss(struct sock *sk) DBGUNDO(sk, partial loss); tp-lost_out = 0; - tp-left_out = tp-sacked_out; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)-icsk_retransmits = 0; @@ -2176,8 +2171,6 @@ static void tcp_try_to_open(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tp-left_out = tp-sacked_out; - if (tp-retrans_out == 0) tp-retrans_stamp = 0; @@ -2187,7 +2180,7 @@ static void tcp_try_to_open(struct sock *sk, int 
flag) if (inet_csk(sk)-icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp-left_out || tp-retrans_out || tp-undo_marker) + if (tp-sacked_out || tp-retrans_out || tp-undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)-icsk_ca_state != state) { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 42588eb..598ddec 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -399,7 +399,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, newicsk-icsk_rto = TCP_TIMEOUT_INIT; newtp-packets_out = 0; - newtp-left_out = 0; newtp-retrans_out = 0; newtp-sacked_out = 0; newtp-fackets_out = 0; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8891775..d67fb3d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -682,10 +682,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff
[PATCH 1/9] [TCP]: Move Reno SACKed_out counter functions earlier
From: =?ISO-8859-1?q?Ilpo_J=E4rvinen?= [EMAIL PROTECTED] Signed-off-by: Ilpo Järvinen [EMAIL PROTECTED] --- net/ipv4/tcp_input.c | 98 -- 1 files changed, 47 insertions(+), 51 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb81e31..925a26f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -1319,6 +1319,53 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, /* F-RTO can only be used if TCP has never retransmitted anything other than * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ +static void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 holes; + + holes = max(tp-lost_out, 1U); + holes = min(holes, tp-packets_out); + + if ((tp-sacked_out + holes) tp-packets_out) { + tp-sacked_out = tp-packets_out - holes; + tcp_update_reordering(sk, tp-packets_out + addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp-sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_sync_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 = tp-sacked_out) + tp-sacked_out = 0; + else + tp-sacked_out -= acked-1; + } + tcp_check_reno_reordering(sk, acked); + tcp_sync_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp-sacked_out = 0; + tp-left_out = tp-lost_out; +} + int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); @@ -1734,57 +1781,6 @@ static int tcp_time_to_recover(struct sock *sk) return 0; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. 
- * The only another reason could be bug in receiver TCP. - */ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 holes; - - holes = max(tp-lost_out, 1U); - holes = min(holes, tp-packets_out); - - if ((tp-sacked_out + holes) tp-packets_out) { - tp-sacked_out = tp-packets_out - holes; - tcp_update_reordering(sk, tp-packets_out + addend, 0); - } -} - -/* Emulate SACKs for SACKless connection: account for a new dupack. */ - -static void tcp_add_reno_sack(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - tp-sacked_out++; - tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); -} - -/* Account for ACK, ACKing some data in Reno Recovery phase. */ - -static void tcp_remove_reno_sacks(struct sock *sk, int acked) -{ - struct tcp_sock *tp = tcp_sk(sk); - - if (acked 0) { - /* One ACK acked hole. The rest eat duplicate ACKs. */ - if (acked-1 = tp-sacked_out) - tp-sacked_out = 0; - else - tp-sacked_out -= acked-1; - } - tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); -} - -static inline void tcp_reset_reno_sack(struct tcp_sock *tp) -{ - tp-sacked_out = 0; - tp-left_out = tp-lost_out; -} - /* RFC: This is from the original, I doubt that this is necessary at all: * clear xmit_retrans hint if seq of this skb is beyond hint. How could we * retransmitted past LOST markings in the first place? I'm not fully sure -- 1.5.0.6 - To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html