Author: rrs
Date: Wed Dec 16 00:56:45 2015
New Revision: 292309
URL: https://svnweb.freebsd.org/changeset/base/292309

Log:
  First cut of the modularization of our TCP stack. Still
  to do is to clean up the timer handling using the async-drain.
  Other optimizations may be coming to go with this. What's here
  will allow different tcp implementations (one included).
  Reviewed by:  jtl, hiren, transports
  Sponsored by: Netflix Inc.
  Differential Revision:        D4055

Added:
  head/sys/modules/tcp/
  head/sys/modules/tcp/fastpath/
  head/sys/modules/tcp/fastpath/Makefile   (contents, props changed)
  head/sys/netinet/tcp_stacks/
  head/sys/netinet/tcp_stacks/fastpath.c   (contents, props changed)
Modified:
  head/sys/modules/Makefile
  head/sys/netinet/tcp.h
  head/sys/netinet/tcp_input.c
  head/sys/netinet/tcp_sack.c
  head/sys/netinet/tcp_subr.c
  head/sys/netinet/tcp_syncache.c
  head/sys/netinet/tcp_timer.c
  head/sys/netinet/tcp_usrreq.c
  head/sys/netinet/tcp_var.h
  head/sys/netinet/toecore.c

Modified: head/sys/modules/Makefile
==============================================================================
--- head/sys/modules/Makefile   Wed Dec 16 00:56:38 2015        (r292308)
+++ head/sys/modules/Makefile   Wed Dec 16 00:56:45 2015        (r292309)
@@ -346,6 +346,7 @@ SUBDIR=     \
        ${_syscons} \
        sysvipc \
        ${_ti} \
+       tcp/fastpath \
        tests/framework \
        tests/callout_test \
        tl \

Added: head/sys/modules/tcp/fastpath/Makefile
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/modules/tcp/fastpath/Makefile      Wed Dec 16 00:56:45 2015        (r292309)
@@ -0,0 +1,15 @@
+#
+# $FreeBSD$
+#
+
+.PATH: ${.CURDIR}/../../../netinet/tcp_stacks
+
+KMOD=  fastpath
+SRCS=  fastpath.c
+
+#
+# Enable full debugging
+#
+#CFLAGS += -g
+
+.include <bsd.kmod.mk>

Modified: head/sys/netinet/tcp.h
==============================================================================
--- head/sys/netinet/tcp.h      Wed Dec 16 00:56:38 2015        (r292308)
+++ head/sys/netinet/tcp.h      Wed Dec 16 00:56:45 2015        (r292309)
@@ -167,7 +167,7 @@ struct tcphdr {
 #define        TCP_KEEPCNT     1024    /* L,N number of keepalives before close */
 #define        TCP_PCAP_OUT    2048    /* number of output packets to keep */
 #define        TCP_PCAP_IN     4096    /* number of input packets to keep */
-
+#define TCP_FUNCTION_BLK 8192  /* Set the tcp function pointers to the specified stack */
 /* Start of reserved space for third-party user-settable options. */
 #define        TCP_VENDOR      SO_VENDOR
 
@@ -245,5 +245,11 @@ struct tcp_info {
        u_int32_t       __tcpi_pad[26];         /* Padding. */
 };
 #endif
+#define TCP_FUNCTION_NAME_LEN_MAX 32
+
+struct tcp_function_set {
+       char function_set_name[TCP_FUNCTION_NAME_LEN_MAX];
+       uint32_t pcbcnt;
+};
 
 #endif /* !_NETINET_TCP_H_ */

Modified: head/sys/netinet/tcp_input.c
==============================================================================
--- head/sys/netinet/tcp_input.c        Wed Dec 16 00:56:38 2015        (r292308)
+++ head/sys/netinet/tcp_input.c        Wed Dec 16 00:56:45 2015        (r292309)
@@ -230,23 +230,6 @@ VNET_DEFINE(struct inpcbhead, tcb);
 #define        tcb6    tcb  /* for KAME src sync over BSD*'s */
 VNET_DEFINE(struct inpcbinfo, tcbinfo);
 
-static void     tcp_dooptions(struct tcpopt *, u_char *, int, int);
-static void     tcp_do_segment(struct mbuf *, struct tcphdr *,
-                    struct socket *, struct tcpcb *, int, int, uint8_t,
-                    int);
-static void     tcp_dropwithreset(struct mbuf *, struct tcphdr *,
-                    struct tcpcb *, int, int);
-static void     tcp_pulloutofband(struct socket *,
-                    struct tcphdr *, struct mbuf *, int);
-static void     tcp_xmit_timer(struct tcpcb *, int);
-static void     tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
-static void inline     cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
-                           uint16_t type);
-static void inline     cc_conn_init(struct tcpcb *tp);
-static void inline     cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
-static void inline     hhook_run_tcp_est_in(struct tcpcb *tp,
-                           struct tcphdr *th, struct tcpopt *to);
-
 /*
  * TCP statistics are stored in an "array" of counter(9)s.
  */
@@ -272,7 +255,7 @@ kmod_tcpstat_inc(int statnum)
 /*
  * Wrapper for the TCP established input helper hook.
  */
-static void inline
+void
 hhook_run_tcp_est_in(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to)
 {
        struct tcp_hhook_data hhook_data;
@@ -290,7 +273,7 @@ hhook_run_tcp_est_in(struct tcpcb *tp, s
 /*
  * CC wrapper hook functions
  */
-static void inline
+void
 cc_ack_received(struct tcpcb *tp, struct tcphdr *th, uint16_t type)
 {
        INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -322,7 +305,7 @@ cc_ack_received(struct tcpcb *tp, struct
        }
 }
 
-static void inline
+void 
 cc_conn_init(struct tcpcb *tp)
 {
        struct hc_metrics_lite metrics;
@@ -446,7 +429,7 @@ cc_cong_signal(struct tcpcb *tp, struct 
        }
 }
 
-static void inline
+void inline
 cc_post_recovery(struct tcpcb *tp, struct tcphdr *th)
 {
        INP_WLOCK_ASSERT(tp->t_inpcb);
@@ -601,9 +584,6 @@ tcp_input(struct mbuf **mp, int *offp, i
        struct tcpopt to;               /* options in this segment */
        char *s = NULL;                 /* address and port logging */
        int ti_locked;
-#define        TI_UNLOCKED     1
-#define        TI_RLOCKED      2
-
 #ifdef TCPDEBUG
        /*
         * The size of tcp_saveipgen must be the size of the max ip header,
@@ -1175,7 +1155,7 @@ relocked:
                         * contains.  tcp_do_segment() consumes
                         * the mbuf chain and unlocks the inpcb.
                         */
-                       tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
+                       tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen,
                            iptos, ti_locked);
                        INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
                        return (IPPROTO_DONE);
@@ -1421,7 +1401,7 @@ relocked:
         * state.  tcp_do_segment() always consumes the mbuf chain, unlocks
         * the inpcb, and unlocks pcbinfo.
         */
-       tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
+       tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked);
        INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
        return (IPPROTO_DONE);
 
@@ -1476,7 +1456,7 @@ drop:
        return (IPPROTO_DONE);
 }
 
-static void
+void
 tcp_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
     struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos,
     int ti_locked)
@@ -1788,7 +1768,7 @@ tcp_do_segment(struct mbuf *m, struct tc
                                                      tp->t_rxtcur);
                                sowwakeup(so);
                                if (sbavail(&so->so_snd))
-                                       (void) tcp_output(tp);
+                                       (void) tp->t_fb->tfb_tcp_output(tp);
                                goto check_delack;
                        }
                } else if (th->th_ack == tp->snd_una &&
@@ -1907,7 +1887,7 @@ tcp_do_segment(struct mbuf *m, struct tc
                                tp->t_flags |= TF_DELACK;
                        } else {
                                tp->t_flags |= TF_ACKNOW;
-                               tcp_output(tp);
+                               tp->t_fb->tfb_tcp_output(tp);
                        }
                        goto check_delack;
                }
@@ -2522,7 +2502,7 @@ tcp_do_segment(struct mbuf *m, struct tc
                                                }
                                        } else
                                                tp->snd_cwnd += tp->t_maxseg;
-                                       (void) tcp_output(tp);
+                                       (void) tp->t_fb->tfb_tcp_output(tp);
                                        goto drop;
                                } else if (tp->t_dupacks == tcprexmtthresh) {
                                        tcp_seq onxt = tp->snd_nxt;
@@ -2556,12 +2536,12 @@ tcp_do_segment(struct mbuf *m, struct tc
                                                    tcps_sack_recovery_episode);
                                                tp->sack_newdata = tp->snd_nxt;
                                                tp->snd_cwnd = tp->t_maxseg;
-                                               (void) tcp_output(tp);
+                                               (void) tp->t_fb->tfb_tcp_output(tp);
                                                goto drop;
                                        }
                                        tp->snd_nxt = th->th_ack;
                                        tp->snd_cwnd = tp->t_maxseg;
-                                       (void) tcp_output(tp);
+                                       (void) tp->t_fb->tfb_tcp_output(tp);
                                        KASSERT(tp->snd_limited <= 2,
                                            ("%s: tp->snd_limited too big",
                                            __func__));
@@ -2608,7 +2588,7 @@ tcp_do_segment(struct mbuf *m, struct tc
                                            (tp->snd_nxt - tp->snd_una);
                                        SOCKBUF_UNLOCK(&so->so_snd);
                                        if (avail > 0)
-                                               (void) tcp_output(tp);
+                                               (void) tp->t_fb->tfb_tcp_output(tp);
                                        sent = tp->snd_max - oldsndmax;
                                        if (sent > tp->t_maxseg) {
                                                KASSERT((tp->t_dupacks == 2 &&
@@ -3074,7 +3054,7 @@ dodata:                                                   
/* XXX */
         * Return any desired output.
         */
        if (needoutput || (tp->t_flags & TF_ACKNOW))
-               (void) tcp_output(tp);
+               (void) tp->t_fb->tfb_tcp_output(tp);
 
 check_delack:
        KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
@@ -3122,7 +3102,7 @@ dropafterack:
        ti_locked = TI_UNLOCKED;
 
        tp->t_flags |= TF_ACKNOW;
-       (void) tcp_output(tp);
+       (void) tp->t_fb->tfb_tcp_output(tp);
        INP_WUNLOCK(tp->t_inpcb);
        m_freem(m);
        return;
@@ -3168,7 +3148,7 @@ drop:
  * The mbuf must still include the original packet header.
  * tp may be NULL.
  */
-static void
+void
 tcp_dropwithreset(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
     int tlen, int rstreason)
 {
@@ -3231,7 +3211,7 @@ drop:
 /*
  * Parse TCP options and place in tcpopt.
  */
-static void
+void
 tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
 {
        int opt, optlen;
@@ -3325,7 +3305,7 @@ tcp_dooptions(struct tcpopt *to, u_char 
  * It is still reflected in the segment length for
  * sequencing purposes.
  */
-static void
+void
 tcp_pulloutofband(struct socket *so, struct tcphdr *th, struct mbuf *m,
     int off)
 {
@@ -3358,7 +3338,7 @@ tcp_pulloutofband(struct socket *so, str
  * Collect new round-trip time estimate
  * and update averages and current timeout.
  */
-static void
+void
 tcp_xmit_timer(struct tcpcb *tp, int rtt)
 {
        int delta;
@@ -3738,7 +3718,7 @@ tcp_mssopt(struct in_conninfo *inc)
  * By setting snd_nxt to ti_ack, this forces retransmission timer to
  * be started again.
  */
-static void
+void
 tcp_newreno_partial_ack(struct tcpcb *tp, struct tcphdr *th)
 {
        tcp_seq onxt = tp->snd_nxt;
@@ -3755,7 +3735,7 @@ tcp_newreno_partial_ack(struct tcpcb *tp
         */
        tp->snd_cwnd = tp->t_maxseg + BYTES_THIS_ACK(tp, th);
        tp->t_flags |= TF_ACKNOW;
-       (void) tcp_output(tp);
+       (void) tp->t_fb->tfb_tcp_output(tp);
        tp->snd_cwnd = ocwnd;
        if (SEQ_GT(onxt, tp->snd_nxt))
                tp->snd_nxt = onxt;

Modified: head/sys/netinet/tcp_sack.c
==============================================================================
--- head/sys/netinet/tcp_sack.c Wed Dec 16 00:56:38 2015        (r292308)
+++ head/sys/netinet/tcp_sack.c Wed Dec 16 00:56:45 2015        (r292309)
@@ -599,7 +599,7 @@ tcp_sack_partialack(struct tcpcb *tp, st
        if (tp->snd_cwnd > tp->snd_ssthresh)
                tp->snd_cwnd = tp->snd_ssthresh;
        tp->t_flags |= TF_ACKNOW;
-       (void) tcp_output(tp);
+       (void) tp->t_fb->tfb_tcp_output(tp);
 }
 
 #if 0

Added: head/sys/netinet/tcp_stacks/fastpath.c
==============================================================================
--- /dev/null   00:00:00 1970   (empty, because file is newly added)
+++ head/sys/netinet/tcp_stacks/fastpath.c      Wed Dec 16 00:56:45 2015        (r292309)
@@ -0,0 +1,2486 @@
+/*-
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995
+ *     The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2007-2008,2010
+ *     Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart <lstewart@freebsd.org>
+ * Copyright (c) 2010 The FreeBSD Foundation
+ * Copyright (c) 2010-2011 Juniper Networks, Inc.
+ * Copyright (c) 2015 Netflix Inc.
+ * All rights reserved.
+ *
+ * Portions of this software were developed at the Centre for Advanced Internet
+ * Architectures, Swinburne University of Technology, by Lawrence Stewart,
+ * James Healy and David Hayes, made possible in part by a grant from the Cisco
+ * University Research Program Fund at Community Foundation Silicon Valley.
+ *
+ * Portions of this software were developed at the Centre for Advanced
+ * Internet Architectures, Swinburne University of Technology, Melbourne,
+ * Australia by David Hayes under sponsorship from the FreeBSD Foundation.
+ *
+ * Portions of this software were developed by Robert N. M. Watson under
+ * contract to Juniper Networks, Inc.
+ *
+ * Portions of this software were developed by Randall R. Stewart while
+ * working for Netflix Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ *     @(#)tcp_input.c 8.12 (Berkeley) 5/24/95
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_ipfw.h"          /* for ipfw_fwd */
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_kdtrace.h"
+#include "opt_tcpdebug.h"
+
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#include <sys/hhook.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>          /* for proc0 declaration */
+#include <sys/protosw.h>
+#include <sys/sdt.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/sysctl.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#include <machine/cpu.h>       /* before tcp_seq.h, for tcp_random18() */
+
+#include <vm/uma.h>
+
+#include <net/if.h>
+#include <net/if_var.h>
+#include <net/route.h>
+#include <net/vnet.h>
+
+#define TCPSTATES              /* for logging */
+
+#include <netinet/cc.h>
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_systm.h>
+#include <netinet/in_var.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h>   /* required for icmp_var.h */
+#include <netinet/icmp_var.h>  /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip_options.h>
+#include <netinet/ip6.h>
+#include <netinet/icmp6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#include <netinet6/nd6.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet6/tcp6_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_syncache.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+
+#ifdef IPSEC
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /*IPSEC*/
+
+#include <machine/in_cksum.h>
+
+#include <security/mac/mac_framework.h>
+
+const int tcprexmtthresh;
+
+VNET_DECLARE(int, tcp_autorcvbuf_inc);
+#define        V_tcp_autorcvbuf_inc    VNET(tcp_autorcvbuf_inc)
+VNET_DECLARE(int, tcp_autorcvbuf_max);
+#define        V_tcp_autorcvbuf_max    VNET(tcp_autorcvbuf_max)
+VNET_DECLARE(int, tcp_do_rfc3042);
+#define        V_tcp_do_rfc3042        VNET(tcp_do_rfc3042)
+VNET_DECLARE(int, tcp_do_autorcvbuf);
+#define        V_tcp_do_autorcvbuf     VNET(tcp_do_autorcvbuf)
+VNET_DECLARE(int, tcp_insecure_rst);
+#define        V_tcp_insecure_rst      VNET(tcp_insecure_rst)
+VNET_DECLARE(int, tcp_insecure_syn);
+#define        V_tcp_insecure_syn      VNET(tcp_insecure_syn)
+
+
+
+
+extern void    tcp_dooptions(struct tcpopt *, u_char *, int, int);
+extern void    tcp_dropwithreset(struct mbuf *, struct tcphdr *,
+                    struct tcpcb *, int, int);
+extern void    tcp_pulloutofband(struct socket *,
+                    struct tcphdr *, struct mbuf *, int);
+extern void    tcp_xmit_timer(struct tcpcb *, int);
+extern void    tcp_newreno_partial_ack(struct tcpcb *, struct tcphdr *);
+extern void    tcp_mss(struct tcpcb *tp, int offer);
+extern void    cc_ack_received(struct tcpcb *tp, struct tcphdr *th,
+                               uint16_t type);
+extern void cc_conn_init(struct tcpcb *tp);
+extern void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th);
+extern void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type);
+extern void hhook_run_tcp_est_in(struct tcpcb *tp,
+                                struct tcphdr *th, struct tcpopt *to);
+
+extern void kmod_tcpstat_inc(int statnum);
+#ifdef TCP_SIGNATURE
+extern int tcp_signature_verify_input(struct mbuf *m, int off0, int tlen, int optlen,
+            struct tcpopt *to, struct tcphdr *th, u_int tcpbflag);
+#endif
+
+static void     tcp_do_segment_fastslow(struct mbuf *, struct tcphdr *,
+                       struct socket *, struct tcpcb *, int, int, uint8_t,
+                       int);
+
+static void     tcp_do_segment_fastack(struct mbuf *, struct tcphdr *,
+                       struct socket *, struct tcpcb *, int, int, uint8_t,
+                       int);
+
+/*
+ * Indicate whether this ack should be delayed.  We can delay the ack if
+ * following conditions are met:
+ *     - There is no delayed ack timer in progress.
+ *     - Our last ack wasn't a 0-sized window. We never want to delay
+ *       the ack that opens up a 0-sized window.
+ *     - LRO wasn't used for this segment. We make sure by checking that the
+ *       segment size is not larger than the MSS.
+ *     - Delayed acks are enabled or this is a half-synchronized T/TCP
+ *       connection.
+ */
+#define DELAY_ACK(tp, tlen)                                            \
+       ((!tcp_timer_active(tp, TT_DELACK) &&                           \
+           (tp->t_flags & TF_RXWIN0SENT) == 0) &&                      \
+           (tlen <= tp->t_maxopd) &&                                   \
+           (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN)))
+
+/*
+ * So how is this faster than the normal fast ack?
+ * It basically allows us to also stay in the fastpath
+ * when a window-update ack also arrives. In testing
+ * we saw only 25-30% of connections doing fastpath 
+ * due to the fact that along with moving forward
+ * in sequence the window was also updated.
+ */
+static void
+tcp_do_fastack(struct mbuf *m, struct tcphdr *th, struct socket *so,
+              struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+              int ti_locked, u_long tiwin)
+{
+       int acked;
+       int winup_only=0;
+#ifdef TCPDEBUG
+       /*
+        * The size of tcp_saveipgen must be the size of the max ip header,
+        * now IPv6.
+        */
+       u_char tcp_saveipgen[IP6_HDR_LEN];
+       struct tcphdr tcp_savetcp;
+       short ostate = 0;
+#endif
+        /*
+        * The following if statment will be true if
+        * we are doing the win_up_in_fp <and>
+        * - We have more new data (SEQ_LT(tp->snd_wl1, th->th_seq)) <or>
+        * - No more new data, but we have an ack for new data
+        *   (tp->snd_wl1 == th->th_seq && SEQ_LT(tp->snd_wl2, th->th_ack))
+        * - No more new data, the same ack point but the window grew
+        *   (tp->snd_wl1 == th->th_seq && tp->snd_wl2 == th->th_ack && twin > tp->snd_wnd)
+        */
+       if ((SEQ_LT(tp->snd_wl1, th->th_seq) ||
+            (tp->snd_wl1 == th->th_seq && (SEQ_LT(tp->snd_wl2, th->th_ack) ||
+                                           (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd))))) {
+               /* keep track of pure window updates */
+               if (tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) {
+                       winup_only = 1;
+                       TCPSTAT_INC(tcps_rcvwinupd);
+               }
+               tp->snd_wnd = tiwin;
+               tp->snd_wl1 = th->th_seq;
+               tp->snd_wl2 = th->th_ack;
+               if (tp->snd_wnd > tp->max_sndwnd)
+                       tp->max_sndwnd = tp->snd_wnd;
+       }
+       /*
+        * If last ACK falls within this segment's sequence numbers,
+        * record the timestamp.
+        * NOTE that the test is modified according to the latest
+        * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+        */
+       if ((to->to_flags & TOF_TS) != 0 &&
+           SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+               tp->ts_recent_age = tcp_ts_getticks();
+               tp->ts_recent = to->to_tsval;
+       }
+       /*
+        * This is a pure ack for outstanding data.
+        */
+       if (ti_locked == TI_RLOCKED) {
+               INP_INFO_RUNLOCK(&V_tcbinfo);
+       }
+       ti_locked = TI_UNLOCKED;
+
+       TCPSTAT_INC(tcps_predack);
+
+       /*
+        * "bad retransmit" recovery.
+        */
+       if (tp->t_rxtshift == 1 &&
+           tp->t_flags & TF_PREVVALID &&
+           (int)(ticks - tp->t_badrxtwin) < 0) {
+               cc_cong_signal(tp, th, CC_RTO_ERR);
+       }
+
+       /*
+        * Recalculate the transmit timer / rtt.
+        *
+        * Some boxes send broken timestamp replies
+        * during the SYN+ACK phase, ignore
+        * timestamps of 0 or we could calculate a
+        * huge RTT and blow up the retransmit timer.
+        */
+       if ((to->to_flags & TOF_TS) != 0 &&
+           to->to_tsecr) {
+               u_int t;
+
+               t = tcp_ts_getticks() - to->to_tsecr;
+               if (!tp->t_rttlow || tp->t_rttlow > t)
+                       tp->t_rttlow = t;
+               tcp_xmit_timer(tp,
+                              TCP_TS_TO_TICKS(t) + 1);
+       } else if (tp->t_rtttime &&
+                  SEQ_GT(th->th_ack, tp->t_rtseq)) {
+               if (!tp->t_rttlow ||
+                   tp->t_rttlow > ticks - tp->t_rtttime)
+                       tp->t_rttlow = ticks - tp->t_rtttime;
+               tcp_xmit_timer(tp,
+                              ticks - tp->t_rtttime);
+       }
+       if (winup_only == 0) {
+               acked = BYTES_THIS_ACK(tp, th);
+
+               /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
+               hhook_run_tcp_est_in(tp, th, to);
+
+               TCPSTAT_ADD(tcps_rcvackbyte, acked);
+               sbdrop(&so->so_snd, acked);
+               if (SEQ_GT(tp->snd_una, tp->snd_recover) &&
+                   SEQ_LEQ(th->th_ack, tp->snd_recover))
+                       tp->snd_recover = th->th_ack - 1;
+                               
+               /*
+                * Let the congestion control algorithm update
+                * congestion control related information. This
+                * typically means increasing the congestion
+                * window.
+                */
+               cc_ack_received(tp, th, CC_ACK);
+
+               tp->snd_una = th->th_ack;
+               /*
+                * Pull snd_wl2 up to prevent seq wrap relative
+                * to th_ack.
+                */
+               tp->snd_wl2 = th->th_ack;
+               tp->t_dupacks = 0;
+               m_freem(m);
+
+               /*
+                * If all outstanding data are acked, stop
+                * retransmit timer, otherwise restart timer
+                * using current (possibly backed-off) value.
+                * If process is waiting for space,
+                * wakeup/selwakeup/signal.  If data
+                * are ready to send, let tcp_output
+                * decide between more output or persist.
+                */
+#ifdef TCPDEBUG
+               if (so->so_options & SO_DEBUG)
+                       tcp_trace(TA_INPUT, ostate, tp,
+                                 (void *)tcp_saveipgen,
+                                 &tcp_savetcp, 0);
+#endif
+               if (tp->snd_una == tp->snd_max)
+                       tcp_timer_activate(tp, TT_REXMT, 0);
+               else if (!tcp_timer_active(tp, TT_PERSIST))
+                       tcp_timer_activate(tp, TT_REXMT,
+                                          tp->t_rxtcur);
+       } else {
+               /* 
+                * Window update only, just free the mbufs and
+                * send out whatever we can.
+                */
+               m_freem(m);
+       }
+       sowwakeup(so);
+       if (sbavail(&so->so_snd))
+               (void) tcp_output(tp);
+       KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+                                           __func__, ti_locked));
+       INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+       INP_WLOCK_ASSERT(tp->t_inpcb);
+
+       if (tp->t_flags & TF_DELACK) {
+               tp->t_flags &= ~TF_DELACK;
+               tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+       }
+       INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * Here nothing is really faster, its just that we
+ * have broken out the fast-data path also just like
+ * the fast-ack. 
+ */
+static void
+tcp_do_fastnewdata(struct mbuf *m, struct tcphdr *th, struct socket *so,
+                  struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+                  int ti_locked, u_long tiwin)
+{
+       int newsize = 0;        /* automatic sockbuf scaling */
+#ifdef TCPDEBUG
+       /*
+        * The size of tcp_saveipgen must be the size of the max ip header,
+        * now IPv6.
+        */
+       u_char tcp_saveipgen[IP6_HDR_LEN];
+       struct tcphdr tcp_savetcp;
+       short ostate = 0;
+#endif
+       /*
+        * If last ACK falls within this segment's sequence numbers,
+        * record the timestamp.
+        * NOTE that the test is modified according to the latest
+        * proposal of the tcplw@cray.com list (Braden 1993/04/26).
+        */
+       if ((to->to_flags & TOF_TS) != 0 &&
+           SEQ_LEQ(th->th_seq, tp->last_ack_sent)) {
+               tp->ts_recent_age = tcp_ts_getticks();
+               tp->ts_recent = to->to_tsval;
+       }
+
+       /*
+        * This is a pure, in-sequence data packet with
+        * nothing on the reassembly queue and we have enough
+        * buffer space to take it.
+        */
+       if (ti_locked == TI_RLOCKED) {
+               INP_INFO_RUNLOCK(&V_tcbinfo);
+       }
+       ti_locked = TI_UNLOCKED;
+
+       /* Clean receiver SACK report if present */
+       if ((tp->t_flags & TF_SACK_PERMIT) && tp->rcv_numsacks)
+               tcp_clean_sackreport(tp);
+       TCPSTAT_INC(tcps_preddat);
+       tp->rcv_nxt += tlen;
+       /*
+        * Pull snd_wl1 up to prevent seq wrap relative to
+        * th_seq.
+        */
+       tp->snd_wl1 = th->th_seq;
+       /*
+        * Pull rcv_up up to prevent seq wrap relative to
+        * rcv_nxt.
+        */
+       tp->rcv_up = tp->rcv_nxt;
+       TCPSTAT_ADD(tcps_rcvbyte, tlen);
+#ifdef TCPDEBUG
+       if (so->so_options & SO_DEBUG)
+               tcp_trace(TA_INPUT, ostate, tp,
+                         (void *)tcp_saveipgen, &tcp_savetcp, 0);
+#endif
+       /*
+        * Automatic sizing of receive socket buffer.  Often the send
+        * buffer size is not optimally adjusted to the actual network
+        * conditions at hand (delay bandwidth product).  Setting the
+        * buffer size too small limits throughput on links with high
+        * bandwidth and high delay (eg. trans-continental/oceanic links).
+        *
+        * On the receive side the socket buffer memory is only rarely
+        * used to any significant extent.  This allows us to be much
+        * more aggressive in scaling the receive socket buffer.  For
+        * the case that the buffer space is actually used to a large
+        * extent and we run out of kernel memory we can simply drop
+        * the new segments; TCP on the sender will just retransmit it
+        * later.  Setting the buffer size too big may only consume too
+        * much kernel memory if the application doesn't read() from
+        * the socket or packet loss or reordering makes use of the
+        * reassembly queue.
+        *
+        * The criteria to step up the receive buffer one notch are:
+        *  1. Application has not set receive buffer size with
+        *     SO_RCVBUF. Setting SO_RCVBUF clears SB_AUTOSIZE.
+        *  2. the number of bytes received during the time it takes
+        *     one timestamp to be reflected back to us (the RTT);
+        *  3. received bytes per RTT is within seven eighth of the
+        *     current socket buffer size;
+        *  4. receive buffer size has not hit maximal automatic size;
+        *
+        * This algorithm does one step per RTT at most and only if
+        * we receive a bulk stream w/o packet losses or reorderings.
+        * Shrinking the buffer during idle times is not necessary as
+        * it doesn't consume any memory when idle.
+        *
+        * TODO: Only step up if the application is actually serving
+        * the buffer to better manage the socket buffer resources.
+        */
+       if (V_tcp_do_autorcvbuf &&
+           (to->to_flags & TOF_TS) &&
+           to->to_tsecr &&
+           (so->so_rcv.sb_flags & SB_AUTOSIZE)) {
+               if (TSTMP_GT(to->to_tsecr, tp->rfbuf_ts) &&
+                   to->to_tsecr - tp->rfbuf_ts < hz) {
+                       if (tp->rfbuf_cnt >
+                           (so->so_rcv.sb_hiwat / 8 * 7) &&
+                           so->so_rcv.sb_hiwat <
+                           V_tcp_autorcvbuf_max) {
+                               newsize =
+                                       min(so->so_rcv.sb_hiwat +
+                                           V_tcp_autorcvbuf_inc,
+                                           V_tcp_autorcvbuf_max);
+                       }
+                       /* Start over with next RTT. */
+                       tp->rfbuf_ts = 0;
+                       tp->rfbuf_cnt = 0;
+               } else
+                       tp->rfbuf_cnt += tlen;  /* add up */
+       }
+
+       /* Add data to socket buffer. */
+       SOCKBUF_LOCK(&so->so_rcv);
+       if (so->so_rcv.sb_state & SBS_CANTRCVMORE) {
+               m_freem(m);
+       } else {
+               /*
+                * Set new socket buffer size.
+                * Give up when limit is reached.
+                */
+               if (newsize)
+                       if (!sbreserve_locked(&so->so_rcv,
+                                             newsize, so, NULL))
+                               so->so_rcv.sb_flags &= ~SB_AUTOSIZE;
+               m_adj(m, drop_hdrlen);  /* delayed header drop */
+               sbappendstream_locked(&so->so_rcv, m, 0);
+       }
+       /* NB: sorwakeup_locked() does an implicit unlock. */
+       sorwakeup_locked(so);
+       if (DELAY_ACK(tp, tlen)) {
+               tp->t_flags |= TF_DELACK;
+       } else {
+               tp->t_flags |= TF_ACKNOW;
+               tcp_output(tp);
+       }
+       KASSERT(ti_locked == TI_UNLOCKED, ("%s: check_delack ti_locked %d",
+                                           __func__, ti_locked));
+       INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+       INP_WLOCK_ASSERT(tp->t_inpcb);
+
+       if (tp->t_flags & TF_DELACK) {
+               tp->t_flags &= ~TF_DELACK;
+               tcp_timer_activate(tp, TT_DELACK, tcp_delacktime);
+       }
+       INP_WUNLOCK(tp->t_inpcb);
+}
+
+/*
+ * The slow-path is a clone of the long part
+ * of tcp_do_segment past all the fast-path stuff.  It
+ * is used here by two different callers, the fast/slow and
+ * the fastack only.
+ */
+static void
+tcp_do_slowpath(struct mbuf *m, struct tcphdr *th, struct socket *so,
+               struct tcpcb *tp, struct tcpopt *to, int drop_hdrlen, int tlen, 
+               int ti_locked, u_long tiwin, int thflags)
+{
+       int  acked, ourfinisacked, needoutput = 0;
+       int rstreason, todrop, win;
+       char *s;
+       struct in_conninfo *inc;
+       struct mbuf *mfree = NULL;
+#ifdef TCPDEBUG
+       /*
+        * The size of tcp_saveipgen must be the size of the max ip header,
+        * now IPv6.
+        */
+       u_char tcp_saveipgen[IP6_HDR_LEN];
+       struct tcphdr tcp_savetcp;
+       short ostate = 0;
+#endif
+       /*
+        * Calculate amount of space in receive window,
+        * and then do TCP input processing.
+        * Receive window is amount of space in rcv queue,
+        * but not less than advertised window.
+        */
+       inc = &tp->t_inpcb->inp_inc;
+       win = sbspace(&so->so_rcv);
+       if (win < 0)
+               win = 0;
+       tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+
+       /* Reset receive buffer auto scaling when not in bulk receive mode. */
+       tp->rfbuf_ts = 0;
+       tp->rfbuf_cnt = 0;
+
+       switch (tp->t_state) {
+
+       /*
+        * If the state is SYN_RECEIVED:
+        *      if seg contains an ACK, but not for our SYN/ACK, send a RST.
+        */
+       case TCPS_SYN_RECEIVED:
+               if ((thflags & TH_ACK) &&
+                   (SEQ_LEQ(th->th_ack, tp->snd_una) ||
+                    SEQ_GT(th->th_ack, tp->snd_max))) {
+                               rstreason = BANDLIM_RST_OPENPORT;
+                               goto dropwithreset;
+               }
+               break;
+
+       /*
+        * If the state is SYN_SENT:
+        *      if seg contains an ACK, but not for our SYN, drop the input.
+        *      if seg contains a RST, then drop the connection.
+        *      if seg does not contain SYN, then drop it.
+        * Otherwise this is an acceptable SYN segment
+        *      initialize tp->rcv_nxt and tp->irs
+        *      if seg contains ack then advance tp->snd_una
+        *      if seg contains an ECE and ECN support is enabled, the stream
+        *          is ECN capable.
+        *      if SYN has been acked change to ESTABLISHED else SYN_RCVD state
+        *      arrange for segment to be acked (eventually)
+        *      continue processing rest of data/controls, beginning with URG
+        */
+       case TCPS_SYN_SENT:
+               if ((thflags & TH_ACK) &&
+                   (SEQ_LEQ(th->th_ack, tp->iss) ||
+                    SEQ_GT(th->th_ack, tp->snd_max))) {
+                       rstreason = BANDLIM_UNLIMITED;
+                       goto dropwithreset;
+               }
+               if ((thflags & (TH_ACK|TH_RST)) == (TH_ACK|TH_RST)) {
+                       TCP_PROBE5(connect__refused, NULL, tp,
+                           mtod(m, const char *), tp, th);
+                       tp = tcp_drop(tp, ECONNREFUSED);
+               }
+               if (thflags & TH_RST)
+                       goto drop;
+               if (!(thflags & TH_SYN))
+                       goto drop;
+
+               tp->irs = th->th_seq;
+               tcp_rcvseqinit(tp);
+               if (thflags & TH_ACK) {
+                       TCPSTAT_INC(tcps_connects);
+                       soisconnected(so);
+#ifdef MAC
+                       mac_socketpeer_set_from_mbuf(m, so);
+#endif
+                       /* Do window scaling on this connection? */
+                       if ((tp->t_flags & (TF_RCVD_SCALE|TF_REQ_SCALE)) ==
+                               (TF_RCVD_SCALE|TF_REQ_SCALE)) {
+                               tp->rcv_scale = tp->request_r_scale;
+                       }
+                       tp->rcv_adv += imin(tp->rcv_wnd,
+                           TCP_MAXWIN << tp->rcv_scale);
+                       tp->snd_una++;          /* SYN is acked */
+                       /*
+                        * If there's data, delay ACK; if there's also a FIN
+                        * ACKNOW will be turned on later.
+                        */
+                       if (DELAY_ACK(tp, tlen) && tlen != 0)
+                               tcp_timer_activate(tp, TT_DELACK,
+                                   tcp_delacktime);
+                       else
+                               tp->t_flags |= TF_ACKNOW;
+
+                       if ((thflags & TH_ECE) && V_tcp_do_ecn) {
+                               tp->t_flags |= TF_ECN_PERMIT;
+                               TCPSTAT_INC(tcps_ecn_shs);
+                       }
+                       
+                       /*
+                        * Received <SYN,ACK> in SYN_SENT[*] state.
+                        * Transitions:
+                        *      SYN_SENT  --> ESTABLISHED
+                        *      SYN_SENT* --> FIN_WAIT_1
+                        */
+                       tp->t_starttime = ticks;
+                       if (tp->t_flags & TF_NEEDFIN) {
+                               tcp_state_change(tp, TCPS_FIN_WAIT_1);
+                               tp->t_flags &= ~TF_NEEDFIN;
+                               thflags &= ~TH_SYN;
+                       } else {
+                               tcp_state_change(tp, TCPS_ESTABLISHED);
+                               TCP_PROBE5(connect__established, NULL, tp,
+                                   mtod(m, const char *), tp, th);
+                               cc_conn_init(tp);
+                               tcp_timer_activate(tp, TT_KEEP,
+                                   TP_KEEPIDLE(tp));
+                       }
+               } else {
+                       /*
+                        * Received initial SYN in SYN-SENT[*] state =>
+                        * simultaneous open.
+                        * If it succeeds, connection is half-synchronized.
+                        * Otherwise, do 3-way handshake:
+                        *        SYN-SENT -> SYN-RECEIVED
+                        *        SYN-SENT* -> SYN-RECEIVED*
+                        */
+                       tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN);
+                       tcp_timer_activate(tp, TT_REXMT, 0);
+                       tcp_state_change(tp, TCPS_SYN_RECEIVED);
+               }
+
+               KASSERT(ti_locked == TI_RLOCKED, ("%s: trimthenstep6: "
+                   "ti_locked %d", __func__, ti_locked));
+               INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+               INP_WLOCK_ASSERT(tp->t_inpcb);
+
+               /*
+                * Advance th->th_seq to correspond to first data byte.
+                * If data, trim to stay within window,
+                * dropping FIN if necessary.
+                */
+               th->th_seq++;
+               if (tlen > tp->rcv_wnd) {
+                       todrop = tlen - tp->rcv_wnd;
+                       m_adj(m, -todrop);
+                       tlen = tp->rcv_wnd;
+                       thflags &= ~TH_FIN;
+                       TCPSTAT_INC(tcps_rcvpackafterwin);
+                       TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+               }
+               tp->snd_wl1 = th->th_seq - 1;
+               tp->rcv_up = th->th_seq;
+               /*
+                * Client side of transaction: already sent SYN and data.
+                * If the remote host used T/TCP to validate the SYN,
+                * our data will be ACK'd; if so, enter normal data segment
+                * processing in the middle of step 5, ack processing.
+                * Otherwise, goto step 6.
+                */
+               if (thflags & TH_ACK)
+                       goto process_ACK;
+
+               goto step6;
+
+       /*
+        * If the state is LAST_ACK or CLOSING or TIME_WAIT:
+        *      do normal processing.
+        *
+        * NB: Leftover from RFC1644 T/TCP.  Cases to be reused later.
+        */

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-all@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-all
To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"

Reply via email to