Module Name: src Committed By: he Date: Sat Feb 14 12:57:53 UTC 2015
Modified Files: src/share/man/man4: tcp.4 src/sys/netinet: tcp.h tcp_input.c tcp_output.c tcp_subr.c tcp_usrreq.c tcp_var.h Log Message: Port over the TCP_INFO socket option from FreeBSD, originally from the Linux 2.6 TCP API. This permits the caller to query certain information about a TCP connection, and is used by pkgsrc's net/iperf3 test program if available. This extends struct tcpcb with three fields to count retransmits, out-of-sequence receives and zero window announcements, and will therefore warrant a kernel revision bump (done separately). To generate a diff of this commit: cvs rdiff -u -r1.29 -r1.30 src/share/man/man4/tcp.4 cvs rdiff -u -r1.30 -r1.31 src/sys/netinet/tcp.h cvs rdiff -u -r1.335 -r1.336 src/sys/netinet/tcp_input.c cvs rdiff -u -r1.179 -r1.180 src/sys/netinet/tcp_output.c cvs rdiff -u -r1.257 -r1.258 src/sys/netinet/tcp_subr.c cvs rdiff -u -r1.202 -r1.203 src/sys/netinet/tcp_usrreq.c cvs rdiff -u -r1.175 -r1.176 src/sys/netinet/tcp_var.h Please note that diffs are not public domain; they are subject to the copyright notices on the relevant files.
Modified files: Index: src/share/man/man4/tcp.4 diff -u src/share/man/man4/tcp.4:1.29 src/share/man/man4/tcp.4:1.30 --- src/share/man/man4/tcp.4:1.29 Thu Oct 10 12:28:10 2013 +++ src/share/man/man4/tcp.4 Sat Feb 14 12:57:52 2015 @@ -1,4 +1,4 @@ -.\" $NetBSD: tcp.4,v 1.29 2013/10/10 12:28:10 christos Exp $ +.\" $NetBSD: tcp.4,v 1.30 2015/02/14 12:57:52 he Exp $ .\" $FreeBSD: tcp.4,v 1.11.2.16 2004/02/16 22:21:47 bms Exp $ .\" .\" Copyright (c) 1983, 1991, 1993 @@ -243,6 +243,23 @@ option value is inherited from the liste This option takes an .Vt "unsigned int" value, with a value greater than 0. +.It Dv TCP_INFO +Information about a socket's underlying TCP session may be retrieved +by passing the read-only option +.Dv TCP_INFO +to +.Xr getsockopt 2 . +It accepts a single argument: a pointer to an instance of +.Vt "struct tcp_info" . +.Pp +This API is subject to change; consult the source to determine +which fields are currently filled out by this option. +.Nx +specific additions include +send window size, +receive window size, +and +bandwidth-controlled window space. 
.\" range of 0 to N (where N is the .\" .Xr sysctl 8 .\" variable Index: src/sys/netinet/tcp.h diff -u src/sys/netinet/tcp.h:1.30 src/sys/netinet/tcp.h:1.31 --- src/sys/netinet/tcp.h:1.30 Sat Jan 7 20:20:22 2012 +++ src/sys/netinet/tcp.h Sat Feb 14 12:57:53 2015 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp.h,v 1.30 2012/01/07 20:20:22 christos Exp $ */ +/* $NetBSD: tcp.h,v 1.31 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (c) 1982, 1986, 1993 @@ -127,7 +127,80 @@ struct tcphdr { #ifdef notyet #define TCP_NOOPT 8 /* reserved for FreeBSD compat */ #endif +#define TCP_INFO 9 /* retrieve tcp_info structure */ #define TCP_MD5SIG 0x10 /* use MD5 digests (RFC2385) */ #define TCP_CONGCTL 0x20 /* selected congestion control */ +#define TCPI_OPT_TIMESTAMPS 0x01 +#define TCPI_OPT_SACK 0x02 +#define TCPI_OPT_WSCALE 0x04 +#define TCPI_OPT_ECN 0x08 +#define TCPI_OPT_TOE 0x10 + +/* + * The TCP_INFO socket option comes from the Linux 2.6 TCP API, and permits + * the caller to query certain information about the state of a TCP + * connection. We provide an overlapping set of fields with the Linux + * implementation, but since this is a fixed size structure, room has been + * left for growth. In order to maximize potential future compatibility with + * the Linux API, the same variable names and order have been adopted, and + * padding left to make room for omitted fields in case they are added later. + * + * XXX: This is currently an unstable ABI/API, in that it is expected to + * change. + */ +struct tcp_info { + uint8_t tcpi_state; /* TCP FSM state. */ + uint8_t __tcpi_ca_state; + uint8_t __tcpi_retransmits; + uint8_t __tcpi_probes; + uint8_t __tcpi_backoff; + uint8_t tcpi_options; /* Options enabled on conn. */ + uint8_t tcpi_snd_wscale:4, /* RFC1323 send shift value. */ + tcpi_rcv_wscale:4; /* RFC1323 recv shift value. */ + + uint32_t tcpi_rto; /* Retransmission timeout (usec). */ + uint32_t __tcpi_ato; + uint32_t tcpi_snd_mss; /* Max segment size for send. 
*/ + uint32_t tcpi_rcv_mss; /* Max segment size for receive. */ + + uint32_t __tcpi_unacked; + uint32_t __tcpi_sacked; + uint32_t __tcpi_lost; + uint32_t __tcpi_retrans; + uint32_t __tcpi_fackets; + + /* Times; measurements in usecs. */ + uint32_t __tcpi_last_data_sent; + uint32_t __tcpi_last_ack_sent; /* Also unimpl. on Linux? */ + uint32_t tcpi_last_data_recv; /* Time since last recv data. */ + uint32_t __tcpi_last_ack_recv; + + /* Metrics; variable units. */ + uint32_t __tcpi_pmtu; + uint32_t __tcpi_rcv_ssthresh; + uint32_t tcpi_rtt; /* Smoothed RTT in usecs. */ + uint32_t tcpi_rttvar; /* RTT variance in usecs. */ + uint32_t tcpi_snd_ssthresh; /* Slow start threshold. */ + uint32_t tcpi_snd_cwnd; /* Send congestion window. */ + uint32_t __tcpi_advmss; + uint32_t __tcpi_reordering; + + uint32_t __tcpi_rcv_rtt; + uint32_t tcpi_rcv_space; /* Advertised recv window. */ + + /* FreeBSD/NetBSD extensions to tcp_info. */ + uint32_t tcpi_snd_wnd; /* Advertised send window. */ + uint32_t tcpi_snd_bwnd; /* No longer used. */ + uint32_t tcpi_snd_nxt; /* Next egress seqno */ + uint32_t tcpi_rcv_nxt; /* Next ingress seqno */ + uint32_t tcpi_toe_tid; /* HWTID for TOE endpoints */ + uint32_t tcpi_snd_rexmitpack; /* Retransmitted packets */ + uint32_t tcpi_rcv_ooopack; /* Out-of-order packets */ + uint32_t tcpi_snd_zerowin; /* Zero-sized windows sent */ + + /* Padding to grow without breaking ABI. */ + uint32_t __tcpi_pad[26]; /* Padding. */ +}; + #endif /* !_NETINET_TCP_H_ */ Index: src/sys/netinet/tcp_input.c diff -u src/sys/netinet/tcp_input.c:1.335 src/sys/netinet/tcp_input.c:1.336 --- src/sys/netinet/tcp_input.c:1.335 Tue Dec 2 20:25:47 2014 +++ src/sys/netinet/tcp_input.c Sat Feb 14 12:57:53 2015 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_input.c,v 1.335 2014/12/02 20:25:47 christos Exp $ */ +/* $NetBSD: tcp_input.c,v 1.336 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. 
@@ -148,7 +148,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.335 2014/12/02 20:25:47 christos Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_input.c,v 1.336 2015/02/14 12:57:53 he Exp $"); #include "opt_inet.h" #include "opt_ipsec.h" @@ -738,6 +738,7 @@ tcp_reass(struct tcpcb *tp, const struct /* * Update the counters. */ + tp->t_rcvoopack++; tcps = TCP_STAT_GETREF(); tcps[TCP_STAT_RCVOOPACK]++; tcps[TCP_STAT_RCVOOBYTE] += rcvoobyte; Index: src/sys/netinet/tcp_output.c diff -u src/sys/netinet/tcp_output.c:1.179 src/sys/netinet/tcp_output.c:1.180 --- src/sys/netinet/tcp_output.c:1.179 Mon Nov 10 18:52:51 2014 +++ src/sys/netinet/tcp_output.c Sat Feb 14 12:57:53 2015 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_output.c,v 1.179 2014/11/10 18:52:51 maxv Exp $ */ +/* $NetBSD: tcp_output.c,v 1.180 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -135,7 +135,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.179 2014/11/10 18:52:51 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.180 2015/02/14 12:57:53 he Exp $"); #include "opt_inet.h" #include "opt_ipsec.h" @@ -439,6 +439,7 @@ tcp_build_datapkt(struct tcpcb *tp, stru if (tp->t_force && len == 1) tcps[TCP_STAT_SNDPROBE]++; else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->t_sndrexmitpack++; tcps[TCP_STAT_SNDREXMITPACK]++; tcps[TCP_STAT_SNDREXMITBYTE] += len; } else { @@ -1401,6 +1402,9 @@ send: if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt)) win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt); th->th_win = htons((u_int16_t) (win>>tp->rcv_scale)); + if (th->th_win == 0) { + tp->t_sndzerowin++; + } if (SEQ_GT(tp->snd_up, tp->snd_nxt)) { u_int32_t urp = tp->snd_up - tp->snd_nxt; if (urp > IP_MAXPACKET) Index: src/sys/netinet/tcp_subr.c diff -u src/sys/netinet/tcp_subr.c:1.257 src/sys/netinet/tcp_subr.c:1.258 --- src/sys/netinet/tcp_subr.c:1.257 Mon Nov 10 18:52:51 2014 +++ src/sys/netinet/tcp_subr.c Sat Feb 14 12:57:53 2015 
@@ -1,4 +1,4 @@ -/* $NetBSD: tcp_subr.c,v 1.257 2014/11/10 18:52:51 maxv Exp $ */ +/* $NetBSD: tcp_subr.c,v 1.258 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -91,7 +91,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.257 2014/11/10 18:52:51 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.258 2015/02/14 12:57:53 he Exp $"); #include "opt_inet.h" #include "opt_ipsec.h" @@ -980,6 +980,9 @@ static struct tcpcb tcpcb_template = { .t_partialacks = -1, .t_bytes_acked = 0, + .t_sndrexmitpack = 0, + .t_rcvoopack = 0, + .t_sndzerowin = 0, }; /* Index: src/sys/netinet/tcp_usrreq.c diff -u src/sys/netinet/tcp_usrreq.c:1.202 src/sys/netinet/tcp_usrreq.c:1.203 --- src/sys/netinet/tcp_usrreq.c:1.202 Mon Nov 10 18:52:51 2014 +++ src/sys/netinet/tcp_usrreq.c Sat Feb 14 12:57:53 2015 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_usrreq.c,v 1.202 2014/11/10 18:52:51 maxv Exp $ */ +/* $NetBSD: tcp_usrreq.c,v 1.203 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -99,7 +99,7 @@ */ #include <sys/cdefs.h> -__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.202 2014/11/10 18:52:51 maxv Exp $"); +__KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c,v 1.203 2015/02/14 12:57:53 he Exp $"); #include "opt_inet.h" #include "opt_ipsec.h" @@ -119,6 +119,7 @@ __KERNEL_RCSID(0, "$NetBSD: tcp_usrreq.c #include <sys/domain.h> #include <sys/sysctl.h> #include <sys/kauth.h> +#include <sys/kernel.h> #include <sys/uidinfo.h> #include <net/if.h> @@ -271,6 +272,65 @@ change_keepalive(struct socket *so, stru TCP_TIMER_ARM(tp, TCPT_2MSL, tp->t_maxidle); } +/* + * Export TCP internal state information via a struct tcp_info, based on the + * Linux 2.6 API. Not ABI compatible as our constants are mapped differently + * (TCP state machine, etc). We export all information using FreeBSD-native + * constants -- for example, the numeric values for tcpi_state will differ + * from Linux. 
+ */ +static void +tcp_fill_info(struct tcpcb *tp, struct tcp_info *ti) +{ + + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = tp->t_state; + if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) + ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->t_flags & TF_SACK_PERMIT) + ti->tcpi_options |= TCPI_OPT_SACK; + if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { + ti->tcpi_options |= TCPI_OPT_WSCALE; + ti->tcpi_snd_wscale = tp->snd_scale; + ti->tcpi_rcv_wscale = tp->rcv_scale; + } + if (tp->t_flags & TF_ECN_PERMIT) { + ti->tcpi_options |= TCPI_OPT_ECN; + } + + ti->tcpi_rto = tp->t_rxtcur * tick; + ti->tcpi_last_data_recv = (long)(hardclock_ticks - + (int)tp->t_rcvtime) * tick; + ti->tcpi_rtt = ((u_int64_t)tp->t_srtt * tick) >> TCP_RTT_SHIFT; + ti->tcpi_rttvar = ((u_int64_t)tp->t_rttvar * tick) >> TCP_RTTVAR_SHIFT; + + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; + /* Linux API wants these in # of segments, apparently */ + ti->tcpi_snd_cwnd = tp->snd_cwnd / tp->t_segsz; + ti->tcpi_snd_wnd = tp->snd_wnd / tp->t_segsz; + + /* + * FreeBSD-specific extension fields for tcp_info. + */ + ti->tcpi_rcv_space = tp->rcv_wnd; + ti->tcpi_rcv_nxt = tp->rcv_nxt; + ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ + ti->tcpi_snd_nxt = tp->snd_nxt; + ti->tcpi_snd_mss = tp->t_segsz; + ti->tcpi_rcv_mss = tp->t_segsz; +#ifdef TF_TOE + if (tp->t_flags & TF_TOE) + ti->tcpi_options |= TCPI_OPT_TOE; +#endif + /* From the redundant department of redundancies... 
*/ + ti->__tcpi_retransmits = ti->__tcpi_retrans = + ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; + + ti->tcpi_rcv_ooopack = tp->t_rcvoopack; + ti->tcpi_snd_zerowin = tp->t_sndzerowin; +} + int tcp_ctloutput(int op, struct socket *so, struct sockopt *sopt) { @@ -280,6 +340,7 @@ tcp_ctloutput(int op, struct socket *so, struct in6pcb *in6p; #endif struct tcpcb *tp; + struct tcp_info ti; u_int ui; int family; /* family of the socket */ int level, optname, optval; @@ -450,6 +511,10 @@ tcp_ctloutput(int op, struct socket *so, optval = tp->t_peermss; error = sockopt_set(sopt, &optval, sizeof(optval)); break; + case TCP_INFO: + tcp_fill_info(tp, &ti); + error = sockopt_set(sopt, &ti, sizeof ti); + break; #ifdef notyet case TCP_CONGCTL: break; Index: src/sys/netinet/tcp_var.h diff -u src/sys/netinet/tcp_var.h:1.175 src/sys/netinet/tcp_var.h:1.176 --- src/sys/netinet/tcp_var.h:1.175 Thu Jul 31 03:39:35 2014 +++ src/sys/netinet/tcp_var.h Sat Feb 14 12:57:53 2015 @@ -1,4 +1,4 @@ -/* $NetBSD: tcp_var.h,v 1.175 2014/07/31 03:39:35 rtr Exp $ */ +/* $NetBSD: tcp_var.h,v 1.176 2015/02/14 12:57:53 he Exp $ */ /* * Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project. @@ -364,6 +364,11 @@ struct tcpcb { u_int t_maxidle; /* t_keepcnt * t_keepintvl */ u_int t_msl; /* MSL to use for this connexion */ + + /* maintain a few stats per connection: */ + int t_rcvoopack; /* out-of-order packets received */ + int t_sndrexmitpack; /* retransmit packets sent */ + int t_sndzerowin; /* zero-window updates sent */ }; /*