This patch contains a collection of changes that bring Linux TCP loss
recovery closer to the behavior specified in the RFCs.

With this patch applied, the kernel still defaults to the standard
Linux recovery behavior; setting net.ipv4.tcp_rfcstrict_recovery=1
enables the changes introduced by this patch.

I've already discussed something along these lines here before, and I
don't think it is productive to re-open the debate about whether or
not Linux should follow the RFCs closely.

I am sending this patch only because, in my tests, these changes have
helped me obtain the expected (and better-performing) results.

The "rfcstrict" recovery proves to perform dramatically better during
Reno recovery from large bursts of network losses.

Regards,
Angelo P. Castellani
diff -urd linux-2.6.16-orig/include/linux/sysctl.h linux-2.6.16-stdrecovery/include/linux/sysctl.h
--- linux-2.6.16-orig/include/linux/sysctl.h	2006-05-16 14:53:02.000000000 +0200
+++ linux-2.6.16-stdrecovery/include/linux/sysctl.h	2006-07-05 17:05:24.000000000 +0200
@@ -397,6 +397,7 @@
 	NET_TCP_CONG_CONTROL=110,
 	NET_TCP_ABC=111,
 	NET_IPV4_IPFRAG_MAX_DIST=112,
+	NET_TCP_RFCSTRICT_RECOVERY,
 };
 
 enum {
diff -urd linux-2.6.16-orig/include/net/tcp.h linux-2.6.16-stdrecovery/include/net/tcp.h
--- linux-2.6.16-orig/include/net/tcp.h	2006-05-16 14:53:02.000000000 +0200
+++ linux-2.6.16-stdrecovery/include/net/tcp.h	2006-07-05 17:06:41.000000000 +0200
@@ -219,6 +219,7 @@
 extern int sysctl_tcp_moderate_rcvbuf;
 extern int sysctl_tcp_tso_win_divisor;
 extern int sysctl_tcp_abc;
+extern int sysctl_tcp_rfcstrict_recovery;
 
 extern atomic_t tcp_memory_allocated;
 extern atomic_t tcp_sockets_allocated;
diff -urd linux-2.6.16-orig/net/ipv4/sysctl_net_ipv4.c linux-2.6.16-stdrecovery/net/ipv4/sysctl_net_ipv4.c
--- linux-2.6.16-orig/net/ipv4/sysctl_net_ipv4.c	2006-05-16 14:53:02.000000000 +0200
+++ linux-2.6.16-stdrecovery/net/ipv4/sysctl_net_ipv4.c	2006-07-05 17:08:31.000000000 +0200
@@ -664,6 +664,14 @@
 		.mode		= 0644,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= NET_TCP_RFCSTRICT_RECOVERY,
+		.procname	= "tcp_rfcstrict_recovery",
+		.data		= &sysctl_tcp_rfcstrict_recovery,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
 
 	{ .ctl_name = 0 }
 };
diff -urd linux-2.6.16-orig/net/ipv4/tcp_input.c linux-2.6.16-stdrecovery/net/ipv4/tcp_input.c
--- linux-2.6.16-orig/net/ipv4/tcp_input.c	2006-05-16 14:53:02.000000000 +0200
+++ linux-2.6.16-stdrecovery/net/ipv4/tcp_input.c	2006-07-05 17:26:41.000000000 +0200
@@ -91,6 +91,8 @@
 int sysctl_tcp_moderate_rcvbuf = 1;
 int sysctl_tcp_abc = 1;
 
+int sysctl_tcp_rfcstrict_recovery = 0;
+
 #define FLAG_DATA		0x01 /* Incoming frame contained data.		*/
 #define FLAG_WIN_UPDATE		0x02 /* Incoming ACK was a window update.	*/
 #define FLAG_DATA_ACKED		0x04 /* This ACK acknowledged new data.		*/
@@ -854,7 +856,8 @@
 				  const int ts)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	if (metric > tp->reordering) {
+	// rfcstrict: no dynamic reordering metric
+	if (!sysctl_tcp_rfcstrict_recovery && metric > tp->reordering) {
 		tp->reordering = min(TCP_MAX_REORDERING, metric);
 
 		/* This exciting event is worth to be remembered. 8) */
@@ -1784,7 +1787,10 @@
 		/* Hold old state until something *above* high_seq
 		 * is ACKed. For Reno it is MUST to prevent false
 		 * fast retransmits (RFC2582). SACK TCP is safe. */
-		tcp_moderate_cwnd(tp);
+		// rfcstrict: a tcp_moderate_cwnd at the end of the recovery
+		//                already solves any kind of burstiness issue
+		if (!sysctl_tcp_rfcstrict_recovery)
+			tcp_moderate_cwnd(tp);
 		return 1;
 	}
 	tcp_set_ca_state(sk, TCP_CA_Open);
@@ -2039,6 +2045,10 @@
 			if (!(flag&FLAG_ECE))
 				tp->prior_ssthresh = tcp_current_ssthresh(sk);
 			tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk);
+			// rfcstrict: standard rule cwnd = ssthresh + 3
+			// note: tp->reordering segments have been already added to sacked_out
+			if (sysctl_tcp_rfcstrict_recovery)
+				tp->snd_cwnd = tp->snd_ssthresh;
 			TCP_ECN_queue_cwr(tp);
 		}
 
@@ -2049,7 +2059,9 @@
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
 		tcp_update_scoreboard(sk, tp);
-	tcp_cwnd_down(sk);
+	// rfcstrict: no further reduction other than cwnd = ssthresh + 3
+	if (!sysctl_tcp_rfcstrict_recovery || icsk->icsk_ca_state == TCP_CA_CWR)
+		tcp_cwnd_down(sk);
 	tcp_xmit_retransmit_queue(sk);
 }
 

Reply via email to