Hi all, I'm a student writing a thesis on TCP performance over high-BDP links, and therefore on TCP congestion control.
To do this work I've built a testbed using the latest Linux release (2.6.16). However, I've come across the fact that the Linux TCP implementation isn't fully standards-compliant. Even if the choices to deviate from the standards were well thought out, I think it should be possible to disable these Linuxisms. Surely this can help everyone using Linux to evaluate a "standard" environment. Moreover, it makes it possible to compare the pros and cons of the Linux implementation against the standard one. So I've disabled the first two Linux-specific mechanisms I've found: - rate halving - dynamic reordering metric (dynamic DupThresh) These are disabled as long as net.ipv4.tcp_standard_compliant=1 (default: 0). However, I don't exclude that there are more non-standard details, so I hope that somebody can point out some more differences between Linux and the RFCs. Moreover, NewReno is implemented in the Impatient variant (resets the retransmit timer only on the first partial ack); with net.ipv4.tcp_slow_but_steady=1 (default: 0) you can enable the Slow-but-Steady variant (resets the retransmit timer on every partial ack). Hoping that this can be useful, I attach the patch. Regards, Angelo P. Castellani
diff -urd ../linux-2.6.16-orig/include/linux/sysctl.h ./include/linux/sysctl.h --- ../linux-2.6.16-orig/include/linux/sysctl.h 2006-05-16 14:53:02.000000000 +0200 +++ ./include/linux/sysctl.h 2006-05-16 14:54:50.000000000 +0200 @@ -397,6 +397,8 @@ NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, NET_IPV4_IPFRAG_MAX_DIST=112, + NET_TCP_STANDARD_COMPLIANT, + NET_TCP_SLOW_BUT_STEADY, }; enum { diff -urd ../linux-2.6.16-orig/include/net/tcp.h ./include/net/tcp.h --- ../linux-2.6.16-orig/include/net/tcp.h 2006-05-16 14:53:02.000000000 +0200 +++ ./include/net/tcp.h 2006-05-16 14:55:43.000000000 +0200 @@ -219,6 +219,8 @@ extern int sysctl_tcp_moderate_rcvbuf; extern int sysctl_tcp_tso_win_divisor; extern int sysctl_tcp_abc; +extern int sysctl_tcp_standard_compliant; +extern int sysctl_tcp_slow_but_steady; extern atomic_t tcp_memory_allocated; extern atomic_t tcp_sockets_allocated; diff -urd ../linux-2.6.16-orig/net/ipv4/sysctl_net_ipv4.c ./net/ipv4/sysctl_net_ipv4.c --- ../linux-2.6.16-orig/net/ipv4/sysctl_net_ipv4.c 2006-05-16 14:53:02.000000000 +0200 +++ ./net/ipv4/sysctl_net_ipv4.c 2006-05-16 14:57:23.000000000 +0200 @@ -664,6 +664,22 @@ .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = NET_TCP_STANDARD_COMPLIANT, + .procname = "tcp_standard_compliant", + .data = &sysctl_tcp_standard_compliant, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = NET_TCP_SLOW_BUT_STEADY, + .procname = "tcp_slow_but_steady", + .data = &sysctl_tcp_slow_but_steady, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, { .ctl_name = 0 } }; diff -urd ../linux-2.6.16-orig/net/ipv4/tcp_input.c ./net/ipv4/tcp_input.c --- ../linux-2.6.16-orig/net/ipv4/tcp_input.c 2006-05-16 14:53:02.000000000 +0200 +++ ./net/ipv4/tcp_input.c 2006-05-16 14:52:43.000000000 +0200 @@ -81,6 +81,7 @@ int sysctl_tcp_dsack = 1; int sysctl_tcp_app_win = 31; int sysctl_tcp_adv_win_scale = 2; +int sysctl_tcp_standard_compliant = 0; int 
sysctl_tcp_stdurg; int sysctl_tcp_rfc1337; @@ -854,7 +855,7 @@ const int ts) { struct tcp_sock *tp = tcp_sk(sk); - if (metric > tp->reordering) { + if (!sysctl_tcp_standard_compliant && metric > tp->reordering) { tp->reordering = min(TCP_MAX_REORDERING, metric); /* This exciting event is worth to be remembered. 8) */ @@ -2039,6 +2040,8 @@ if (!(flag&FLAG_ECE)) tp->prior_ssthresh = tcp_current_ssthresh(sk); tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); + if (sysctl_tcp_standard_compliant) + tp->snd_cwnd = tp->snd_ssthresh; // tp->reordering segments should've been already added to sacked_out TCP_ECN_queue_cwr(tp); } @@ -2049,7 +2052,8 @@ if (is_dupack || tcp_head_timedout(sk, tp)) tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(sk); + if (!sysctl_tcp_standard_compliant || icsk->icsk_ca_state == TCP_CA_CWR) + tcp_cwnd_down(sk); tcp_xmit_retransmit_queue(sk); } diff -urd ../linux-2.6.16-orig/net/ipv4/tcp_output.c ./net/ipv4/tcp_output.c --- ../linux-2.6.16-orig/net/ipv4/tcp_output.c 2006-05-16 14:53:02.000000000 +0200 +++ ./net/ipv4/tcp_output.c 2006-05-16 14:52:43.000000000 +0200 @@ -51,6 +51,9 @@ */ int sysctl_tcp_tso_win_divisor = 3; +/* Enables the Slow-but-Steady variant of NewReno (cfr. RFC2582 Ch.4) */ +int sysctl_tcp_slow_but_steady = 0; + static void update_send_head(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) { @@ -1604,7 +1607,7 @@ else NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - if (skb == + if (sysctl_tcp_slow_but_steady || skb == skb_peek(&sk->sk_write_queue)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto,