Module Name:    src
Committed By:   he
Date:           Sat Feb 14 12:57:53 UTC 2015

Modified Files:
        src/share/man/man4: tcp.4
        src/sys/netinet: tcp.h tcp_input.c tcp_output.c tcp_subr.c tcp_usrreq.c
            tcp_var.h

Log Message:
Port over the TCP_INFO socket option from FreeBSD, originally from
the Linux 2.6 TCP API.  This permits the caller to query certain information
about a TCP connection, and is used by pkgsrc's net/iperf3 test program
if available.

This extends struct tcpcb with three fields to count retransmits,
out-of-sequence receives and zero window announcements, and will
therefore warrant a kernel revision bump (done separately).


To generate a diff of this commit:
cvs rdiff -u -r1.29 -r1.30 src/share/man/man4/tcp.4
cvs rdiff -u -r1.30 -r1.31 src/sys/netinet/tcp.h
cvs rdiff -u -r1.335 -r1.336 src/sys/netinet/tcp_input.c
cvs rdiff -u -r1.179 -r1.180 src/sys/netinet/tcp_output.c
cvs rdiff -u -r1.257 -r1.258 src/sys/netinet/tcp_subr.c
cvs rdiff -u -r1.202 -r1.203 src/sys/netinet/tcp_usrreq.c
cvs rdiff -u -r1.175 -r1.176 src/sys/netinet/tcp_var.h

Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.

Modified files:

Index: src/share/man/man4/tcp.4
diff -u src/share/man/man4/tcp.4:1.29 src/share/man/man4/tcp.4:1.30
--- src/share/man/man4/tcp.4:1.29	Thu Oct 10 12:28:10 2013
+++ src/share/man/man4/tcp.4	Sat Feb 14 12:57:52 2015
@@ -1,4 +1,4 @@
-.\"	$NetBSD: tcp.4,v 1.29 2013/10/10 12:28:10 christos Exp $
+.\"	$NetBSD: tcp.4,v 1.30 2015/02/14 12:57:52 he Exp $
 .\"	$FreeBSD: tcp.4,v 1.11.2.16 2004/02/16 22:21:47 bms Exp $
 .\"
 .\" Copyright (c) 1983, 1991, 1993
@@ -243,6 +243,23 @@ option value is inherited from the liste
 This option takes an
 .Vt "unsigned int"
 value, with a value greater than 0.
+.It Dv TCP_INFO
+Information about a socket's underlying TCP session may be retrieved
+by passing the read-only option
+.Dv TCP_INFO
+to 
+.Xr getsockopt 2 .
+It accepts a single argument: a pointer to an instance of
+.Vt "struct tcp_info" .
+.Pp
+This API is subject to change; consult the source to determine
+which fields are currently filled out by this option.
+.Nx
+specific additions include
+send window size,
+receive window size,
+and
+bandwidth-controlled window space.
 .\" range of 0 to N (where N is the
 .\" .Xr sysctl 8
 .\" variable

Index: src/sys/netinet/tcp.h
diff -u src/sys/netinet/tcp.h:1.30 src/sys/netinet/tcp.h:1.31
--- src/sys/netinet/tcp.h:1.30	Sat Jan  7 20:20:22 2012
+++ src/sys/netinet/tcp.h	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp.h,v 1.30 2012/01/07 20:20:22 christos Exp $	*/
+/*	$NetBSD: tcp.h,v 1.31 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (c) 1982, 1986, 1993
@@ -127,7 +127,80 @@ struct tcphdr {
 #ifdef notyet
 #define	TCP_NOOPT	8	/* reserved for FreeBSD compat */
 #endif
+#define	TCP_INFO	9	/* retrieve tcp_info structure */
 #define	TCP_MD5SIG	0x10	/* use MD5 digests (RFC2385) */
 #define	TCP_CONGCTL	0x20	/* selected congestion control */
 
+#define	TCPI_OPT_TIMESTAMPS	0x01
+#define	TCPI_OPT_SACK		0x02
+#define	TCPI_OPT_WSCALE		0x04
+#define	TCPI_OPT_ECN		0x08
+#define	TCPI_OPT_TOE		0x10
+
+/*
+ * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits
+ * the caller to query certain information about the state of a TCP
+ * connection.  We provide an overlapping set of fields with the Linux
+ * implementation, but since this is a fixed size structure, room has been
+ * left for growth.  In order to maximize potential future compatibility with
+ * the Linux API, the same variable names and order have been adopted, and
+ * padding left to make room for omitted fields in case they are added later.
+ *
+ * XXX: This is currently an unstable ABI/API, in that it is expected to
+ * change.
+ */
+struct tcp_info {
+	uint8_t		tcpi_state; /* TCP FSM state. */
+	uint8_t		__tcpi_ca_state;
+	uint8_t		__tcpi_retransmits;	/* Retransmit count, low 8 bits. */
+	uint8_t		__tcpi_probes;
+	uint8_t		__tcpi_backoff;
+	uint8_t		tcpi_options;	       /* TCPI_OPT_* flags enabled on conn. */
+	uint8_t		tcpi_snd_wscale:4,	/* RFC1323 send shift value. */
+			tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */
+
+	uint32_t	tcpi_rto;		/* Retransmission timeout (usec). */
+	uint32_t	__tcpi_ato;
+	uint32_t	tcpi_snd_mss;		/* Max segment size for send. */
+	uint32_t	tcpi_rcv_mss;		/* Max segment size for receive. */
+
+	uint32_t	__tcpi_unacked;
+	uint32_t	__tcpi_sacked;
+	uint32_t	__tcpi_lost;
+	uint32_t	__tcpi_retrans;		/* Same as tcpi_snd_rexmitpack. */
+	uint32_t	__tcpi_fackets;
+
+	/* Times; measurements in usecs. */
+	uint32_t	__tcpi_last_data_sent;
+	uint32_t	__tcpi_last_ack_sent;	/* Also unimpl. on Linux? */
+	uint32_t	tcpi_last_data_recv;	/* Time since last recv data. */
+	uint32_t	__tcpi_last_ack_recv;
+
+	/* Metrics; variable units. */
+	uint32_t	__tcpi_pmtu;
+	uint32_t	__tcpi_rcv_ssthresh;
+	uint32_t	tcpi_rtt;		/* Smoothed RTT in usecs. */
+	uint32_t	tcpi_rttvar;		/* RTT variance in usecs. */
+	uint32_t	tcpi_snd_ssthresh;	/* Slow start threshold. */
+	uint32_t	tcpi_snd_cwnd;		/* Send congestion window (segments). */
+	uint32_t	__tcpi_advmss;
+	uint32_t	__tcpi_reordering;
+
+	uint32_t	__tcpi_rcv_rtt;
+	uint32_t	tcpi_rcv_space;		/* Advertised recv window. */
+
+	/* FreeBSD/NetBSD extensions to tcp_info. */
+	uint32_t	tcpi_snd_wnd;		/* Advertised send window (segments). */
+	uint32_t	tcpi_snd_bwnd;		/* No longer used; always 0. */
+	uint32_t	tcpi_snd_nxt;		/* Next egress seqno */
+	uint32_t	tcpi_rcv_nxt;		/* Next ingress seqno */
+	uint32_t	tcpi_toe_tid;		/* HWTID for TOE endpoints (not filled) */
+	uint32_t	tcpi_snd_rexmitpack;	/* Retransmitted packets */
+	uint32_t	tcpi_rcv_ooopack;	/* Out-of-order packets */
+	uint32_t	tcpi_snd_zerowin;	/* Zero-sized windows sent */
+	
+	/* Padding to grow without breaking ABI. */
+	uint32_t	__tcpi_pad[26];		/* Padding. */
+};
+
 #endif /* !_NETINET_TCP_H_ */

Index: src/sys/netinet/tcp_input.c
diff -u src/sys/netinet/tcp_input.c:1.335 src/sys/netinet/tcp_input.c:1.336
--- src/sys/netinet/tcp_input.c:1.335	Tue Dec  2 20:25:47 2014
+++ src/sys/netinet/tcp_input.c	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp_input.c,v 1.335 2014/12/02 20:25:47 christos Exp $	*/
+/*	$NetBSD: tcp_input.c,v 1.336 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -148,7 +148,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.335 2014/12/02 20:25:47 christos Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.336 2015/02/14 12:57:53 he Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -738,6 +738,7 @@ tcp_reass(struct tcpcb *tp, const struct
 	/*
 	 * Update the counters.
 	 */
+	tp->t_rcvoopack++;
 	tcps = TCP_STAT_GETREF();
 	tcps[TCP_STAT_RCVOOPACK]++;
 	tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte;

Index: src/sys/netinet/tcp_output.c
diff -u src/sys/netinet/tcp_output.c:1.179 src/sys/netinet/tcp_output.c:1.180
--- src/sys/netinet/tcp_output.c:1.179	Mon Nov 10 18:52:51 2014
+++ src/sys/netinet/tcp_output.c	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp_output.c,v 1.179 2014/11/10 18:52:51 maxv Exp $	*/
+/*	$NetBSD: tcp_output.c,v 1.180 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -135,7 +135,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.179 2014/11/10 18:52:51 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.180 2015/02/14 12:57:53 he Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -439,6 +439,7 @@ tcp_build_datapkt(struct tcpcb *tp, stru
 	if (tp->t_force && len == 1)
 		tcps[TCP_STAT_SNDPROBE]++;
 	else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
+		tp->t_sndrexmitpack++;
 		tcps[TCP_STAT_SNDREXMITPACK]++;
 		tcps[TCP_STAT_SNDREXMITBYTE] += len;
 	} else {
@@ -1401,6 +1402,9 @@ send:
 	if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
 		win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
 	th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
+	if (th->th_win == 0) {
+		tp->t_sndzerowin++;
+	}
 	if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
 		u_int32_t urp = tp->snd_up - tp->snd_nxt;
 		if (urp > IP_MAXPACKET)

Index: src/sys/netinet/tcp_subr.c
diff -u src/sys/netinet/tcp_subr.c:1.257 src/sys/netinet/tcp_subr.c:1.258
--- src/sys/netinet/tcp_subr.c:1.257	Mon Nov 10 18:52:51 2014
+++ src/sys/netinet/tcp_subr.c	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp_subr.c,v 1.257 2014/11/10 18:52:51 maxv Exp $	*/
+/*	$NetBSD: tcp_subr.c,v 1.258 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -91,7 +91,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.257 2014/11/10 18:52:51 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.258 2015/02/14 12:57:53 he Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -980,6 +980,9 @@ static struct tcpcb tcpcb_template = {
 
 	.t_partialacks = -1,
 	.t_bytes_acked = 0,
+	.t_sndrexmitpack = 0,
+	.t_rcvoopack = 0,
+	.t_sndzerowin = 0,
 };
 
 /*

Index: src/sys/netinet/tcp_usrreq.c
diff -u src/sys/netinet/tcp_usrreq.c:1.202 src/sys/netinet/tcp_usrreq.c:1.203
--- src/sys/netinet/tcp_usrreq.c:1.202	Mon Nov 10 18:52:51 2014
+++ src/sys/netinet/tcp_usrreq.c	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp_usrreq.c,v 1.202 2014/11/10 18:52:51 maxv Exp $	*/
+/*	$NetBSD: tcp_usrreq.c,v 1.203 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -99,7 +99,7 @@
  */
 
 #include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.202 2014/11/10 18:52:51 maxv Exp $");
+__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.203 2015/02/14 12:57:53 he Exp $");
 
 #include "opt_inet.h"
 #include "opt_ipsec.h"
@@ -119,6 +119,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c
 #include <sys/domain.h>
 #include <sys/sysctl.h>
 #include <sys/kauth.h>
+#include <sys/kernel.h>
 #include <sys/uidinfo.h>
 
 #include <net/if.h>
@@ -271,6 +272,65 @@ change_keepalive(struct socket *so, stru
 		TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle);
 }
 
+/*
+ * Fill *ti from the connection state in *tp, for the TCP_INFO socket option.
+ * The layout is based on the Linux 2.6 API but is not ABI compatible with
+ * it: values are exported using this kernel's own constants, so for example
+ * the numeric values of tcpi_state (copied from tp->t_state) will differ
+ * from Linux.  Fields that are not set below are left zeroed.
+ */
+static void
+tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti)
+{
+
+	bzero(ti, sizeof(*ti));		/* unset fields and padding read as 0 */
+
+	ti->tcpi_state = tp->t_state;
+	if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP))
+		ti->tcpi_options |= TCPI_OPT_TIMESTAMPS;
+	if (tp->t_flags & TF_SACK_PERMIT)
+		ti->tcpi_options |= TCPI_OPT_SACK;
+	if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) {
+		ti->tcpi_options |= TCPI_OPT_WSCALE;
+		ti->tcpi_snd_wscale = tp->snd_scale;
+		ti->tcpi_rcv_wscale = tp->rcv_scale;
+	}
+	if (tp->t_flags & TF_ECN_PERMIT) {
+		ti->tcpi_options |= TCPI_OPT_ECN;
+	}
+
+	ti->tcpi_rto = tp->t_rxtcur * tick;	/* XXX confirm t_rxtcur units */
+	ti->tcpi_last_data_recv = (long)(hardclock_ticks -
+					 (int)tp->t_rcvtime) * tick;
+	ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT;
+	ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT;
+
+	ti->tcpi_snd_ssthresh = tp->snd_ssthresh;
+	/* Linux API wants segments; NOTE(review): assumes t_segsz != 0. */
+	ti->tcpi_snd_cwnd = tp->snd_cwnd / tp->t_segsz;
+	ti->tcpi_snd_wnd = tp->snd_wnd / tp->t_segsz;
+
+	/*
+	 * Remaining fields, including FreeBSD/NetBSD extensions to tcp_info.
+	 */
+	ti->tcpi_rcv_space = tp->rcv_wnd;
+	ti->tcpi_rcv_nxt = tp->rcv_nxt;
+	ti->tcpi_snd_bwnd = 0;		/* Unused, kept for compat. */
+	ti->tcpi_snd_nxt = tp->snd_nxt;
+	ti->tcpi_snd_mss = tp->t_segsz;
+	ti->tcpi_rcv_mss = tp->t_segsz;
+#ifdef TF_TOE
+	if (tp->t_flags & TF_TOE)
+		ti->tcpi_options |= TCPI_OPT_TOE;
+#endif
+	/* Same count in all three; __tcpi_retransmits keeps only 8 bits. */
+	ti->__tcpi_retransmits = ti->__tcpi_retrans =
+		ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack;
+
+	ti->tcpi_rcv_ooopack = tp->t_rcvoopack;
+	ti->tcpi_snd_zerowin = tp->t_sndzerowin;
+}
+
 int
 tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt)
 {
@@ -280,6 +340,7 @@ tcp_ctloutput(int op, struct socket *so,
 	struct in6pcb *in6p;
 #endif
 	struct tcpcb *tp;
+	struct tcp_info ti;
 	u_int ui;
 	int family;	/* family of the socket */
 	int level, optname, optval;
@@ -450,6 +511,10 @@ tcp_ctloutput(int op, struct socket *so,
 			optval = tp->t_peermss;
 			error = sockopt_set(sopt, &optval, sizeof(optval));
 			break;
+		case TCP_INFO:
+			tcp_fill_info(tp, &ti);
+			error = sockopt_set(sopt, &ti, sizeof ti);
+			break;
 #ifdef notyet
 		case TCP_CONGCTL:
 			break;

Index: src/sys/netinet/tcp_var.h
diff -u src/sys/netinet/tcp_var.h:1.175 src/sys/netinet/tcp_var.h:1.176
--- src/sys/netinet/tcp_var.h:1.175	Thu Jul 31 03:39:35 2014
+++ src/sys/netinet/tcp_var.h	Sat Feb 14 12:57:53 2015
@@ -1,4 +1,4 @@
-/*	$NetBSD: tcp_var.h,v 1.175 2014/07/31 03:39:35 rtr Exp $	*/
+/*	$NetBSD: tcp_var.h,v 1.176 2015/02/14 12:57:53 he Exp $	*/
 
 /*
  * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -364,6 +364,11 @@ struct tcpcb {
 	u_int	t_maxidle;		/* t_keepcnt * t_keepintvl */
 
 	u_int	t_msl;			/* MSL to use for this connexion */
+
+	/* maintain a few stats per connection: */
+	int	t_rcvoopack;	 	/* out-of-order packets received */
+	int	t_sndrexmitpack; 	/* retransmit packets sent */
+	int	t_sndzerowin;		/* zero-window updates sent */
 };
 
 /*

Reply via email to