Implements the SACK-enhanced FRTO given in RFC4138 using the variant given in Appendix B.
RFC4138, Appendix B: "This means that in order to declare timeout spurious, the TCP sender must receive an acknowledgment for non-retransmitted segment between SND.UNA and RecoveryPoint in algorithm step 3. RecoveryPoint is defined in conservative SACK-recovery algorithm [RFC3517]" The basic version of the FRTO algorithm can still be used also when SACK is enabled. To enabled SACK-enhanced version, tcp_frto sysctl is set to 2. Signed-off-by: Ilpo Järvinen <[EMAIL PROTECTED]> --- net/ipv4/tcp_input.c | 76 +++++++++++++++++++++++++++++++++++++++++++------- 1 files changed, 65 insertions(+), 11 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 356de02..3ce4019 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -100,6 +100,7 @@ #define FLAG_DATA_SACKED 0x20 /* New SAC #define FLAG_ECE 0x40 /* ECE in this ACK */ #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ +#define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) @@ -110,6 +111,8 @@ #define IsReno(tp) ((tp)->rx_opt.sack_ok #define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) #define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define IsSackFrto() (sysctl_tcp_frto == 0x2) + #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) /* Adapt the MSS value used to make delayed ack decision to the @@ -1159,6 +1162,18 @@ tcp_sacktag_write_queue(struct sock *sk, /* clear lost hint */ tp->retransmit_skb_hint = NULL; } + /* SACK enhanced F-RTO detection. + * Set flag if and only if non-rexmitted + * segments below frto_highmark are + * SACKed (RFC4138; Appendix B). + * Clearing correct due to in-order walk + */ + if (after(end_seq, tp->frto_highmark)) { + flag &= ~FLAG_ONLY_ORIG_SACKED; + } else { + if (!(sacked & TCPCB_RETRANS)) + flag |= FLAG_ONLY_ORIG_SACKED; + } } TCP_SKB_CB(skb)->sacked |= TCPCB_SACKED_ACKED; @@ -1240,7 +1255,8 @@ #endif /* F-RTO can only be used if these conditions are satisfied: * - there must be some unsent new data * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head + * - TCP has never retransmitted anything other than head (SACK enhanced + * variant from Appendix B of RFC4138 is more robust here) */ int tcp_use_frto(struct sock *sk) { @@ -1252,6 +1268,9 @@ int tcp_use_frto(struct sock *sk) tp->snd_una + tp->snd_wnd)) return 0; + if (IsSackFrto()) + return 1; + /* Avoid expensive walking of rexmit queue if possible */ if (tp->retrans_out > 1) return 0; @@ -1328,9 +1347,18 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); + /* Earlier loss recovery underway (see RFC4138; Appendix B). + * The last condition is necessary at least in tp->frto_counter case. + */ + if (IsSackFrto() && (tp->frto_counter || + ((1 << icsk->icsk_ca_state) & (TCPF_CA_Recovery|TCPF_CA_Loss))) && + after(tp->high_seq, tp->snd_una)) { + tp->frto_highmark = tp->high_seq; + } else { + tp->frto_highmark = tp->snd_nxt; + } tcp_set_ca_state(sk, TCP_CA_Disorder); tp->high_seq = tp->snd_nxt; - tp->frto_highmark = tp->snd_nxt; tp->frto_counter = 1; } @@ -2566,6 +2594,10 @@ static void tcp_conservative_spur_to_res * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. * + * SACK version: + * on first step, wait until first cumulative ACK arrives, then move to + * the second step. In second step, the next ACK decides. + * * F-RTO is implemented (mainly) in four functions: * - tcp_use_frto() is used to determine if TCP is can use F-RTO * - tcp_enter_frto() prepares TCP state on RTO if F-RTO is used, it is @@ -2590,16 +2622,38 @@ static int tcp_process_frto(struct sock return 1; } - /* RFC4138 shortcoming in step 2; should also have case c): ACK isn't - * duplicate nor advances window, e.g., opposite dir data, winupdate - */ - if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && - !(flag&FLAG_FORWARD_PROGRESS)) - return 1; + if (!IsSackFrto() || IsReno(tp)) { + /* RFC4138 shortcoming in step 2; should also have case c): + * ACK isn't duplicate nor advances window, e.g., opposite dir + * data, winupdate + */ + if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && + !(flag&FLAG_FORWARD_PROGRESS)) + return 1; - if (!(flag&FLAG_DATA_ACKED)) { - tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), flag); - return 1; + if (!(flag&FLAG_DATA_ACKED)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 0 : 3), + flag); + return 1; + } + } else { + if (!(flag&FLAG_DATA_ACKED) && (tp->frto_counter == 1)) { + /* Prevent sending of new data. */ + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp)); + return 1; + } + + if ((tp->frto_counter == 2) && + (!(flag&FLAG_FORWARD_PROGRESS) || + ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { + /* RFC4138 shortcoming (see comment above) */ + if (!(flag&FLAG_FORWARD_PROGRESS) && (flag&FLAG_NOT_DUP)) + return 1; + + tcp_enter_frto_loss(sk, 3, flag); + return 1; + } } if (tp->frto_counter == 1) { -- 1.4.2 - To unsubscribe from this list: send the line "unsubscribe netdev" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html