Hi Daniele, few comments inline. > -----Original Message----- > From: Daniele Di Proietto [mailto:diproiet...@vmware.com] > Sent: Wednesday, April 27, 2016 7:36 AM > To: Fischetti, Antonio <antonio.fische...@intel.com> > Cc: dev@openvswitch.org > Subject: Re: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace > connection tracker. > > Hi Antonio, > > Thank you for your comments, replies inline > > > > > On 25/04/2016 09:14, "Fischetti, Antonio" > <antonio.fische...@intel.com> wrote: > > >Hi Daniele, > >some comments inline prefixed with [Antonio F]. > > > >Regards, > >Antonio > > > >> -----Original Message----- > >> From: dev [mailto:dev-boun...@openvswitch.org] On Behalf Of > Daniele Di > >> Proietto > >> Sent: Saturday, April 16, 2016 1:03 AM > >> To: dev@openvswitch.org > >> Subject: [ovs-dev] [PATCH v2 04/15] conntrack: New userspace > connection > >> tracker. > >> > >> This commit adds the conntrack module. > >> > >> It is a connection tracker that resides entirely in userspace. > Its > >> primary user will be the dpif-netdev datapath. > >> > >> The module main goal is to provide conntrack_execute(), which > offers a > >> convenient interface to implement the datapath ct() action. > >> > >> The conntrack module uses two submodules to deal with the l4 > protocol > >> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP). > >> > >> The conntrack-tcp submodule implementation is adapted from > FreeBSD's pf > >> subsystem, therefore it's BSD licensed. It has been slightly > altered to > >> match the OVS coding style and to allow the pickup of already > >> established connections. 
> >> > >> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com> > >> --- > >> COPYING | 1 + > >> debian/copyright.in | 4 + > >> lib/automake.mk | 5 + > >> lib/conntrack-other.c | 91 ++++++ > >> lib/conntrack-private.h | 77 +++++ > >> lib/conntrack-tcp.c | 476 +++++++++++++++++++++++++++ > >> lib/conntrack.c | 851 > >> ++++++++++++++++++++++++++++++++++++++++++++++++ > >> lib/conntrack.h | 144 ++++++++ > >> 8 files changed, 1649 insertions(+) > >> create mode 100644 lib/conntrack-other.c > >> create mode 100644 lib/conntrack-private.h > >> create mode 100644 lib/conntrack-tcp.c > >> create mode 100644 lib/conntrack.c > >> create mode 100644 lib/conntrack.h > >> > >> diff --git a/COPYING b/COPYING > >> index 308e3ea..afb98b9 100644 > >> --- a/COPYING > >> +++ b/COPYING > >> @@ -25,6 +25,7 @@ License, version 2. > >> The following files are licensed under the 2-clause BSD license. > >> include/windows/getopt.h > >> lib/getopt_long.c > >> + lib/conntrack-tcp.c > >> > >> The following files are licensed under the 3-clause BSD-license > >> include/windows/netinet/icmp6.h > >> diff --git a/debian/copyright.in b/debian/copyright.in > >> index 57d007a..a15f4dd 100644 > >> --- a/debian/copyright.in > >> +++ b/debian/copyright.in > >> @@ -21,6 +21,9 @@ Upstream Copyright Holders: > >> Copyright (c) 2014 Michael Chapman > >> Copyright (c) 2014 WindRiver, Inc. > >> Copyright (c) 2014 Avaya, Inc. 
> >> + Copyright (c) 2001 Daniel Hartmeier > >> + Copyright (c) 2002 - 2008 Henning Brauer > >> + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > >> > >> License: > >> > >> @@ -90,6 +93,7 @@ License: > >> lib/getopt_long.c > >> include/windows/getopt.h > >> datapath-windows/ovsext/Conntrack-tcp.c > >> + lib/conntrack-tcp.c > >> > >> * The following files are licensed under the 3-clause BSD-license > >> > >> diff --git a/lib/automake.mk b/lib/automake.mk > >> index 1ec2115..ba30442 100644 > >> --- a/lib/automake.mk > >> +++ b/lib/automake.mk > >> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \ > >> lib/compiler.h \ > >> lib/connectivity.c \ > >> lib/connectivity.h \ > >> + lib/conntrack-private.h \ > >> + lib/conntrack-tcp.c \ > >> + lib/conntrack-other.c \ > >> + lib/conntrack.c \ > >> + lib/conntrack.h \ > >> lib/coverage.c \ > >> lib/coverage.h \ > >> lib/crc32c.c \ > >> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c > >> new file mode 100644 > >> index 0000000..65d02a9 > >> --- /dev/null > >> +++ b/lib/conntrack-other.c > >> @@ -0,0 +1,91 @@ > >> +/* > >> + * Copyright (c) 2015, 2016 Nicira, Inc. > >> + * > >> + * Licensed under the Apache License, Version 2.0 (the > "License"); > >> + * you may not use this file except in compliance with the > License. > >> + * You may obtain a copy of the License at: > >> + * > >> + * http://www.apache.org/licenses/LICENSE-2.0 > >> + * > >> + * Unless required by applicable law or agreed to in writing, > software > >> + * distributed under the License is distributed on an "AS IS" > BASIS, > >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express > or > >> implied. > >> + * See the License for the specific language governing > permissions and > >> + * limitations under the License. 
> >> + */ > >> + > >> +#include <config.h> > >> + > >> +#include "conntrack-private.h" > >> +#include "dp-packet.h" > >> + > >> +enum other_state { > >> + OTHERS_FIRST, > >> + OTHERS_MULTIPLE, > >> + OTHERS_BIDIR, > >> +}; > >> + > >> +struct conn_other { > >> + struct conn up; > >> + enum other_state state; > >> +}; > >> + > >> +static const long long other_timeouts[] = { > >> + [OTHERS_FIRST] = 60 * 1000, > >> + [OTHERS_MULTIPLE] = 60 * 1000, > >> + [OTHERS_BIDIR] = 30 * 1000, > >> +}; > >> + > >> +static struct conn_other * > >> +conn_other_cast(const struct conn *conn) > >> +{ > >> + return CONTAINER_OF(conn, struct conn_other, up); > >> +} > >> + > >> +static void > >> +update_expiration(struct conn_other *conn, long long now) > >> +{ > >> + conn->up.expiration = now + other_timeouts[conn->state]; > >> +} > >> + > >> +static enum ct_update_res > >> +other_conn_update(struct conn *conn_, struct dp_packet *pkt > >> OVS_UNUSED, > >> + bool reply, long long now) > >> +{ > >> + struct conn_other *conn = conn_other_cast(conn_); > >> + > >> + if (reply && conn->state != OTHERS_BIDIR) { > >> + conn->state = OTHERS_BIDIR; > >> + } else if (conn->state == OTHERS_FIRST) { > >> + conn->state = OTHERS_MULTIPLE; > >> + } > >> + > >> + update_expiration(conn, now); > >> + > >> + return CT_UPDATE_VALID; > >> +} > >> + > >> +static bool > >> +other_valid_new(struct dp_packet *pkt OVS_UNUSED) > >> +{ > >> + return true; > >> +} > >> + > >> +static struct conn * > >> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now) > >> +{ > >> + struct conn_other *conn; > >> + > >> + conn = xzalloc(sizeof(struct conn_other)); > >> + conn->state = OTHERS_FIRST; > >> + > >> + update_expiration(conn, now); > >> + > >> + return &conn->up; > >> +} > >> + > >> +struct ct_l4_proto ct_proto_other = { > >> + .new_conn = other_new_conn, > >> + .valid_new = other_valid_new, > >> + .conn_update = other_conn_update, > >> +}; > >> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h 
> >> new file mode 100644 > >> index 0000000..e668c44 > >> --- /dev/null > >> +++ b/lib/conntrack-private.h > >> @@ -0,0 +1,77 @@ > >> +/* > >> + * Copyright (c) 2015, 2016 Nicira, Inc. > >> + * > >> + * Licensed under the Apache License, Version 2.0 (the > "License"); > >> + * you may not use this file except in compliance with the > License. > >> + * You may obtain a copy of the License at: > >> + * > >> + * http://www.apache.org/licenses/LICENSE-2.0 > >> + * > >> + * Unless required by applicable law or agreed to in writing, > software > >> + * distributed under the License is distributed on an "AS IS" > BASIS, > >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express > or > >> implied. > >> + * See the License for the specific language governing > permissions and > >> + * limitations under the License. > >> + */ > >> + > >> +#ifndef CONNTRACK_PRIVATE_H > >> +#define CONNTRACK_PRIVATE_H 1 > >> + > >> +#include <sys/types.h> > >> +#include <netinet/in.h> > >> +#include <netinet/ip6.h> > >> + > >> +#include "hmap.h" > >> +#include "openvswitch/types.h" > >> +#include "packets.h" > >> +#include "unaligned.h" > >> + > >> +struct ct_addr { > >> + union { > >> + ovs_16aligned_be32 ipv4; > >> + union ovs_16aligned_in6_addr ipv6; > >> + ovs_be32 ipv4_aligned; > >> + struct in6_addr ipv6_aligned; > >> + }; > >> +}; > >> + > >> +struct ct_endpoint { > >> + struct ct_addr addr; > >> + ovs_be16 port; > >> +}; > >> + > >> +struct conn_key { > >> + struct ct_endpoint src; > >> + struct ct_endpoint dst; > >> + > >> + ovs_be16 dl_type; > >> + uint8_t nw_proto; > >> + uint16_t zone; > >> +}; > >> + > >> +struct conn { > >> + struct conn_key key; > >> + struct conn_key rev_key; > >> + long long expiration; > >> + struct hmap_node node; > >> + uint32_t mark; > >> + ovs_u128 label; > >> +}; > >> + > >> +enum ct_update_res { > >> + CT_UPDATE_INVALID, > >> + CT_UPDATE_VALID, > >> + CT_UPDATE_NEW, > >> +}; > >> + > >> +struct ct_l4_proto { > >> + struct conn 
*(*new_conn)(struct dp_packet *pkt, long long > now); > >> + bool (*valid_new)(struct dp_packet *pkt); > >> + enum ct_update_res (*conn_update)(struct conn *conn, struct > dp_packet > >> *pkt, > >> + bool reply, long long now); > >> +}; > >> + > >> +extern struct ct_l4_proto ct_proto_tcp; > >> +extern struct ct_l4_proto ct_proto_other; > >> + > >> +#endif /* conntrack-private.h */ > >> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c > >> new file mode 100644 > >> index 0000000..4d80038 > >> --- /dev/null > >> +++ b/lib/conntrack-tcp.c > >> @@ -0,0 +1,476 @@ > >> +/*- > >> + * Copyright (c) 2001 Daniel Hartmeier > >> + * Copyright (c) 2002 - 2008 Henning Brauer > >> + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> > >> + * Copyright (c) 2015, 2016 Nicira, Inc. > >> + * All rights reserved. > >> + * > >> + * Redistribution and use in source and binary forms, with or > without > >> + * modification, are permitted provided that the following > conditions > >> + * are met: > >> + * > >> + * - Redistributions of source code must retain the above > copyright > >> + * notice, this list of conditions and the following > disclaimer. > >> + * - Redistributions in binary form must reproduce the above > >> + * copyright notice, this list of conditions and the > following > >> + * disclaimer in the documentation and/or other materials > provided > >> + * with the distribution. > >> + * > >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND > >> CONTRIBUTORS > >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT > NOT > >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND > >> FITNESS > >> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE > >> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, > >> INDIRECT, > >> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES > >> (INCLUDING, > >> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR > >> SERVICES; > >> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) > HOWEVER > >> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, > >> STRICT > >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING > IN > >> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF > THE > >> + * POSSIBILITY OF SUCH DAMAGE. > >> + * > >> + * Effort sponsored in part by the Defense Advanced Research > Projects > >> + * Agency (DARPA) and Air Force Research Laboratory, Air Force > >> + * Materiel Command, USAF, under agreement number F30602-01-2- > 0537. > >> + * > >> + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ > >> + */ > >> + > >> +#include <config.h> > >> + > >> +#include "conntrack-private.h" > >> +#include "ct-dpif.h" > >> +#include "dp-packet.h" > >> +#include "util.h" > >> + > >> +struct tcp_peer { > >> + enum ct_dpif_tcp_state state; > >> + uint32_t seqlo; /* Max sequence number > sent */ > >> + uint32_t seqhi; /* Max the other end > ACKd + win */ > >> + uint16_t max_win; /* largest window (pre > scaling) */ > >> + uint8_t wscale; /* window scaling > factor */ > >> +}; > >> + > >> +struct conn_tcp { > >> + struct conn up; > >> + struct tcp_peer peer[2]; > >> +}; > >> + > >> +enum { > >> + TCPOPT_EOL, > >> + TCPOPT_NOP, > >> + TCPOPT_WINDOW = 3, > >> +}; > >> + > >> +/* TCP sequence numbers are 32 bit integers operated > >> + * on with modular arithmetic. These macros can be > >> + * used to compare such integers. */ > >> +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) > >> +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) > >> +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) > >> +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) > >> + > >> +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? 
(a) : (b)) > >> +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? (a) : (b)) > >> + > >> +static struct conn_tcp* > >> +conn_tcp_cast(const struct conn* conn) > >> +{ > >> + return CONTAINER_OF(conn, struct conn_tcp, up); > >> +} > >> + > >> +/* pf does this in in pf_normalize_tcp(), and it is called only > if scrub > >> + * is enabled. We're not scrubbing, but this check seems > reasonable. */ > >> +static bool > >> +tcp_invalid_flags(uint16_t flags) > >> +{ > >> + > >> + if (flags & TCP_SYN) { > >> + if (flags & TCP_RST) { > >> + return true; > >> + } > >> + if (flags & TCP_FIN) { > >> + /* Here pf removes the fin flag. We simply mark the > packet as > >> + * invalid */ > >> + return true; > >> + } > >> + } else { > >> + /* Illegal packet */ > >> + if (!(flags & (TCP_ACK|TCP_RST))) { > >> + return true; > >> + } > >> + } > >> + > >> + if (!(flags & TCP_ACK)) { > >> + /* These flags are only valid if ACK is set */ > >> + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & > TCP_URG)) { > >> + return true; > >> + } > >> + } > >> + > >> + return false; > >> +} > >> + > >> +#define TCP_MAX_WSCALE 14 > >> +#define CT_WSCALE_FLAG 0x80 > >> +#define CT_WSCALE_UNKNOWN 0x40 > >> +#define CT_WSCALE_MASK 0xf > >> + > >> +static uint8_t > >> +tcp_get_wscale(const struct tcp_header *tcp) > >> +{ > >> + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp; > >> + const uint8_t *opt = (const uint8_t *)(tcp + 1); > >> + uint8_t wscale = 0; > >> + uint8_t optlen; > >> + > >> + while (len >= 3) { > >> + if (*opt == TCPOPT_EOL) { > >> + break; > >> + } > >> + switch (*opt) { > >> + case TCPOPT_NOP: > >> + opt++; > >> + len--; > >> + break; > >> + case TCPOPT_WINDOW: > >> + wscale = MIN(opt[2], TCP_MAX_WSCALE); > >> + wscale |= CT_WSCALE_FLAG; > >> + /* fall through */ > >> + default: > >> + optlen = opt[1]; > >> + if (optlen < 2) { > >> + optlen = 2; > >> + } > >> + len -= optlen; > >> + opt += optlen; > >> + } > >> + } > >> + > >> + return wscale; > >> +} > >> + > >> +static uint32_t > 
>> +tcp_payload_length(struct dp_packet *pkt) > >> +{ > >> + return (char *) dp_packet_tail(pkt) - > dp_packet_l2_pad_size(pkt) > >> + - (char *) dp_packet_get_tcp_payload(pkt); > >> +} > >> + > >> +static void > >> +update_expiration(struct conn_tcp *conn, long long now, long long > interval) > >> +{ > >> + conn->up.expiration = now + interval; > >> +} > >> + > >> + > >> +static enum ct_update_res > >> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool > reply, > >> + long long now) > >> +{ > >> + struct conn_tcp *conn = conn_tcp_cast(conn_); > >> + struct tcp_header *tcp = dp_packet_l4(pkt); > >> + /* The peer that sent 'pkt' */ > >> + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; > >> + /* The peer that should receive 'pkt' */ > >> + struct tcp_peer *dst = &conn->peer[reply ? 0 : 1]; > >> + uint8_t sws = 0, dws = 0; > >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > >> + > >> + uint16_t win = ntohs(tcp->tcp_winsz); > >> + uint32_t ack, end, seq, orig_seq; > >> + uint32_t p_len = tcp_payload_length(pkt); > >> + int ackskew; > >> + > >> + if (tcp_invalid_flags(tcp_flags)) { > >> + return CT_UPDATE_INVALID; > >> + } > >> + > >> + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) && > >> + dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && > >> + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > >> + src->state = dst->state = CT_DPIF_TCPS_CLOSED; > >> + return CT_UPDATE_NEW; > >> + } > >> + > >> + if (src->wscale & CT_WSCALE_FLAG > >> + && dst->wscale & CT_WSCALE_FLAG > >> + && !(tcp_flags & TCP_SYN)) { > >> + > >> + sws = src->wscale & CT_WSCALE_MASK; > >> + dws = dst->wscale & CT_WSCALE_MASK; > >> + > >> + } else if (src->wscale & CT_WSCALE_UNKNOWN > >> + && dst->wscale & CT_WSCALE_UNKNOWN > >> + && !(tcp_flags & TCP_SYN)) { > >> + > >> + sws = TCP_MAX_WSCALE; > >> + dws = TCP_MAX_WSCALE; > >> + } > >> + > >> + /* > >> + * Sequence tracking algorithm from Guido van Rooij's paper: > >> + * http://www.madison- > gurkha.com/publications/tcp_filtering/ > >> + * 
tcp_filtering.ps > >> + */ > >> + > >> + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > >> + if (src->state < CT_DPIF_TCPS_SYN_SENT) { > >> + /* First packet from this end. Set its state */ > >> + > >> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > >> + > >> + end = seq + p_len; > >> + if (tcp_flags & TCP_SYN) { > >> + end++; > >> + if (dst->wscale & CT_WSCALE_FLAG) { > >> + src->wscale = tcp_get_wscale(tcp); > >> + if (src->wscale & CT_WSCALE_FLAG) { > >> + /* Remove scale factor from initial window */ > >> + sws = src->wscale & CT_WSCALE_MASK; > >> + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); > >> + dws = dst->wscale & CT_WSCALE_MASK; > >> + } else { > >> + /* fixup other window */ > >> + dst->max_win <<= dst->wscale & > >> + CT_WSCALE_MASK; > >> + /* in case of a retrans SYN|ACK */ > >> + dst->wscale = 0; > >> + } > >> + } > >> + } > >> + if (tcp_flags & TCP_FIN) { > >> + end++; > >> + } > >> + > >> + src->seqlo = seq; > >> + src->state = CT_DPIF_TCPS_SYN_SENT; > >> + /* > >> + * May need to slide the window (seqhi may have been set > by > >> + * the crappy stack check or if we picked up the > connection > >> + * after establishment) > >> + */ > >> + if (src->seqhi == 1 || > >> + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src- > >seqhi)) { > >> + src->seqhi = end + MAX(1, dst->max_win << dws); > >> + } > >> + if (win > src->max_win) { > >> + src->max_win = win; > >> + } > >> + > >> + } else { > >> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); > >> + end = seq + p_len; > >> + if (tcp_flags & TCP_SYN) { > >> + end++; > >> + } > >> + if (tcp_flags & TCP_FIN) { > >> + end++; > >> + } > >> + } > >> + > >> + if ((tcp_flags & TCP_ACK) == 0) { > >> + /* Let it pass through the ack skew check */ > >> + ack = dst->seqlo; > >> + } else if ((ack == 0 > >> + && (tcp_flags & (TCP_ACK|TCP_RST)) == > (TCP_ACK|TCP_RST)) > >> + /* broken tcp stacks do not set ack */) { > >> + /* Many stacks (ours included) will set the ACK number in > an > >> + * FIN|ACK 
if the SYN times out -- no sequence to ACK. */ > >> + ack = dst->seqlo; > >> + } > >> + > >> + if (seq == end) { > >> + /* Ease sequencing restrictions on no data packets */ > >> + seq = src->seqlo; > >> + end = seq; > >> + } > >> + > >> + ackskew = dst->seqlo - ack; > >> +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary > fudge > >> factor */ > >> + if (SEQ_GEQ(src->seqhi, end) > >> + /* Last octet inside other's window space */ > >> + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) > >> + /* Retrans: not more than one window back */ > >> + && (ackskew >= -MAXACKWINDOW) > >> + /* Acking not more than one reassembled fragment > backwards */ > >> + && (ackskew <= (MAXACKWINDOW << sws)) > >> + /* Acking not more than one window forward */ > >> + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo > >> + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == > src->seqlo))) { > >> + /* Require an exact/+1 sequence match on resets when > possible */ > >> + > >> + /* update max window */ > >> + if (src->max_win < win) { > >> + src->max_win = win; > >> + } > >> + /* synchronize sequencing */ > >> + if (SEQ_GT(end, src->seqlo)) { > >> + src->seqlo = end; > >> + } > >> + /* slide the window of what the other end can send */ > >> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > >> + dst->seqhi = ack + MAX((win << sws), 1); > >> + } > >> + > >> + /* update states */ > >> + if (tcp_flags & TCP_SYN && src->state < > CT_DPIF_TCPS_SYN_SENT) { > >> + src->state = CT_DPIF_TCPS_SYN_SENT; > >> + } > >> + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > >> + src->state = CT_DPIF_TCPS_CLOSING; > >> + } > >> + if (tcp_flags & TCP_ACK) { > >> + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { > >> + dst->state = CT_DPIF_TCPS_ESTABLISHED; > >> + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { > >> + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; > >> + } > >> + } > >> + if (tcp_flags & TCP_RST) { > >> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > >> + } > 
>> + > >> + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 > >> + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { > >> + update_expiration(conn, now, 30 * 1000); > >> + } else if (src->state >= CT_DPIF_TCPS_CLOSING > >> + && dst->state >= CT_DPIF_TCPS_CLOSING) { > >> + update_expiration(conn, now, 45 * 1000); > >> + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED > >> + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { > >> + update_expiration(conn, now, 30 * 1000); > >> + } else if (src->state >= CT_DPIF_TCPS_CLOSING > >> + || dst->state >= CT_DPIF_TCPS_CLOSING) { > >> + update_expiration(conn, now, 15 * 60 * 1000); > >> + } else { > >> + update_expiration(conn, now, 24 * 60 * 60 * 1000); > >> + } > >> + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT > >> + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 > >> + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) > >> + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) > >> + /* Within a window forward of the originating > packet */ > >> + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { > >> + /* Within a window backward of the originating > packet */ > >> + > >> + /* > >> + * This currently handles three situations: > >> + * 1) Stupid stacks will shotgun SYNs before their peer > >> + * replies. > >> + * 2) When PF catches an already established stream (the > >> + * firewall rebooted, the state table was flushed, > routes > >> + * changed...) > >> + * 3) Packets get funky immediately after the connection > >> + * closes (this should catch Solaris spurious > ACK|FINs > >> + * that web servers like to spew after a close) > >> + * > >> + * This must be a little more careful than the above code > >> + * since packet floods will also be caught here. We don't > >> + * update the TTL here to mitigate the damage of a packet > >> + * flood and so the same code can handle awkward > establishment > >> + * and a loosened connection close. 
> >> + * In the establishment case, a correct peer response > will > >> + * validate the connection, go through the normal state > code > >> + * and keep updating the state TTL. > >> + */ > >> + > >> + /* update max window */ > >> + if (src->max_win < win) { > >> + src->max_win = win; > >> + } > >> + /* synchronize sequencing */ > >> + if (SEQ_GT(end, src->seqlo)) { > >> + src->seqlo = end; > >> + } > >> + /* slide the window of what the other end can send */ > >> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { > >> + dst->seqhi = ack + MAX((win << sws), 1); > >> + } > >> + > >> + /* > >> + * Cannot set dst->seqhi here since this could be a > shotgunned > >> + * SYN and not an already established connection. > >> + */ > >> + > >> + if (tcp_flags & TCP_FIN && src->state < > CT_DPIF_TCPS_CLOSING) { > >> + src->state = CT_DPIF_TCPS_CLOSING; > >> + } > >> + > >> + if (tcp_flags & TCP_RST) { > >> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; > >> + } > >> + } else { > >> + return CT_UPDATE_INVALID; > >> + } > >> + > >> + return CT_UPDATE_VALID; > >> +} > >> + > >> +static bool > >> +tcp_valid_new(struct dp_packet *pkt) > >> +{ > >> + struct tcp_header *tcp = dp_packet_l4(pkt); > >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > >> + > >> + if (tcp_invalid_flags(tcp_flags)) { > >> + return false; > >> + } > >> + > >> + /* A syn+ack is not allowed to create a connection. We want > to allow > >> + * totally new connections (syn) or already established, not > partially > >> + * open (syn+ack). 
*/ > >> + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) { > >> + return false; > >> + } > >> + > >> + return true; > >> +} > >> + > >> +static struct conn * > >> +tcp_new_conn(struct dp_packet *pkt, long long now) > >> +{ > >> + struct conn_tcp* newconn = NULL; > >> + struct tcp_header *tcp = dp_packet_l4(pkt); > >> + struct tcp_peer *src, *dst; > >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); > >> + > >> + newconn = xzalloc(sizeof(struct conn_tcp)); > >> + > >> + src = &newconn->peer[0]; > >> + dst = &newconn->peer[1]; > >> + > >> + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq)); > >> + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1; > >> + > >> + if (tcp_flags & TCP_SYN) { > >> + src->seqhi++; > >> + src->wscale = tcp_get_wscale(tcp); > >> + } else { > >> + src->wscale = CT_WSCALE_UNKNOWN; > >> + dst->wscale = CT_WSCALE_UNKNOWN; > >> + } > >> + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1); > >> + if (src->wscale & CT_WSCALE_MASK) { > >> + /* Remove scale factor from initial window */ > >> + uint8_t sws = src->wscale & CT_WSCALE_MASK; > >> + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << > sws); > >> + } > >> + if (tcp_flags & TCP_FIN) { > >> + src->seqhi++; > >> + } > >> + dst->seqhi = 1; > >> + dst->max_win = 1; > >> + src->state = CT_DPIF_TCPS_SYN_SENT; > >> + dst->state = CT_DPIF_TCPS_CLOSED; > >> + > >> + update_expiration(newconn, now, 30 * 1000); > >> + > >> + return &newconn->up; > >> +} > >> + > >> +struct ct_l4_proto ct_proto_tcp = { > >> + .new_conn = tcp_new_conn, > >> + .valid_new = tcp_valid_new, > >> + .conn_update = tcp_conn_update, > >> +}; > >> diff --git a/lib/conntrack.c b/lib/conntrack.c > >> new file mode 100644 > >> index 0000000..840335b > >> --- /dev/null > >> +++ b/lib/conntrack.c > >> @@ -0,0 +1,851 @@ > >> +/* > >> + * Copyright (c) 2015, 2016 Nicira, Inc. 
> >> + * > >> + * Licensed under the Apache License, Version 2.0 (the > "License"); > >> + * you may not use this file except in compliance with the > License. > >> + * You may obtain a copy of the License at: > >> + * > >> + * http://www.apache.org/licenses/LICENSE-2.0 > >> + * > >> + * Unless required by applicable law or agreed to in writing, > software > >> + * distributed under the License is distributed on an "AS IS" > BASIS, > >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express > or > >> implied. > >> + * See the License for the specific language governing > permissions and > >> + * limitations under the License. > >> + */ > >> + > >> +#include <config.h> > >> +#include "conntrack.h" > >> + > >> +#include <errno.h> > >> +#include <sys/types.h> > >> +#include <netinet/in.h> > >> +#include <netinet/icmp6.h> > >> + > >> +#include "bitmap.h" > >> +#include "conntrack-private.h" > >> +#include "dp-packet.h" > >> +#include "flow.h" > >> +#include "hmap.h" > >> +#include "netdev.h" > >> +#include "odp-netlink.h" > >> +#include "openvswitch/vlog.h" > >> +#include "ovs-rcu.h" > >> +#include "random.h" > >> +#include "timeval.h" > >> + > >> +VLOG_DEFINE_THIS_MODULE(conntrack); > >> + > >> +struct conn_lookup_ctx { > >> + struct conn_key key; > >> + struct conn *conn; > >> + uint32_t hash; > >> + bool reply; > >> + bool related; > >> +}; > >> + > >> +static bool conn_key_extract(struct conntrack *, struct dp_packet > *, > >> + struct conn_lookup_ctx *, uint16_t > zone); > >> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t > basis); > >> +static void conn_key_reverse(struct conn_key *); > >> +static void conn_key_lookup(struct conntrack *ct, > >> + struct conn_lookup_ctx *ctx, > >> + unsigned bucket, > >> + long long now); > >> +static bool valid_new(struct dp_packet *pkt, struct conn_key *); > >> +static struct conn *new_conn(struct dp_packet *pkt, struct > conn_key *, > >> + long long now); > >> +static void delete_conn(struct conn *); 
> >> +static enum ct_update_res conn_update(struct conn *, struct > dp_packet*, > >> + bool reply, long long now); > >> +static bool conn_expired(struct conn *, long long now); > >> +static void set_mark(struct dp_packet *, struct conn *, > >> + uint32_t val, uint32_t mask); > >> +static void set_label(struct dp_packet *, struct conn *, > >> + const struct ovs_key_ct_labels *val, > >> + const struct ovs_key_ct_labels *mask); > >> + > >> +static struct ct_l4_proto *l4_protos[] = { > >> + [IPPROTO_TCP] = &ct_proto_tcp, > >> + [IPPROTO_UDP] = &ct_proto_other, > >> + [IPPROTO_ICMP] = &ct_proto_other, > >> +}; > >> + > >> +/* Initializes the connection tracker 'ct'. The caller is > responbile for > >> + * calling 'conntrack_destroy()', when the instance is not needed > anymore */ > >> +void > >> +conntrack_init(struct conntrack *ct) > >> +{ > >> + unsigned i; > >> + > >> + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > >> + ct_lock_init(&ct->locks[i]); > >> + ct_lock_lock(&ct->locks[i]); > >> + hmap_init(&ct->connections[i]); > >> + ct_lock_unlock(&ct->locks[i]); > >> + } > >> + ct->hash_basis = random_uint32(); > >> + ct->purge_bucket = 0; > >> + ct->purge_inner_bucket = 0; > >> + ct->purge_inner_offset = 0; > >> +} > >> + > >> +/* Destroys the connection tracker 'ct' and frees all the > allocated memory. */ > >> +void > >> +conntrack_destroy(struct conntrack *ct) > >> +{ > >> + unsigned i; > >> + > >> + for (i = 0; i < CONNTRACK_BUCKETS; i++) { > >> + struct conn *conn, *next; > >> + > >> + ct_lock_lock(&ct->locks[i]); > >> + HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) > { > >> + hmap_remove(&ct->connections[i], &conn->node); > >> + delete_conn(conn); > >> + } > >> + hmap_destroy(&ct->connections[i]); > >> + ct_lock_unlock(&ct->locks[i]); > >> + ct_lock_destroy(&ct->locks[i]); > >> + } > >> +} > >> + > > > >> +static unsigned hash_to_bucket(uint32_t hash) > >> +{ > >> + /* Extracts the most significant bits in hash. 
The least > significant bits > >> + * are already used internally by the hmap implementation. */ > >> + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && > >> CONNTRACK_BUCKETS_SHIFT >= 1); > >> + > >> + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % > >> CONNTRACK_BUCKETS; > >> +} > >> + > >> +static void > >> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone, > >> + uint32_t mark, ovs_u128 label) > >> +{ > >> + pkt->md.ct_state = state | CS_TRACKED; > >> + pkt->md.ct_zone = zone; > >> + pkt->md.ct_mark = mark; > >> + pkt->md.ct_label = label; > >> +} > >> + > >> +static struct conn * > >> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt, > >> + struct conn_lookup_ctx *ctx, uint8_t *state, bool > commit, > >> + long long now) > >> +{ > >> + unsigned bucket = hash_to_bucket(ctx->hash); > >> + struct conn *nc = NULL; > >> + > >> + if (!valid_new(pkt, &ctx->key)) { > >> + *state |= CS_INVALID; > >> + return nc; > >> + } > >> + > >> + *state |= CS_NEW; > >> + > >> + if (commit) { > >> + nc = new_conn(pkt, &ctx->key, now); > >> + > >> + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); > >> + > >> + conn_key_reverse(&nc->rev_key); > >> + hmap_insert(&ct->connections[bucket], &nc->node, ctx- > >hash); > >> + } > >> + > >> + return nc; > >> +} > >> + > >> +static struct conn * > >> +process_one(struct conntrack *ct, struct dp_packet *pkt, > >> + struct conn_lookup_ctx *ctx, uint16_t zone, > >> + bool commit, long long now) > >> +{ > >> + unsigned bucket = hash_to_bucket(ctx->hash); > >> + struct conn *conn = ctx->conn; > >> + uint8_t state = 0; > >> + > >> + if (conn) { > >> + if (ctx->related) { > >> + state |= CS_RELATED; > >> + if (ctx->reply) { > >> + state |= CS_REPLY_DIR; > >> + } > >> + } else { > >> + enum ct_update_res res; > >> + > >> + res = conn_update(conn, pkt, ctx->reply, now); > >> + > >> + switch (res) { > >> + case CT_UPDATE_VALID: > >> + state |= CS_ESTABLISHED; > >> + if (ctx->reply) { > >> + state |= CS_REPLY_DIR; > >> + } 
> >> + break; > >> + case CT_UPDATE_INVALID: > >> + state |= CS_INVALID; > >> + break; > >> + case CT_UPDATE_NEW: > >> + hmap_remove(&ct->connections[bucket], &conn- > >node); > >> + delete_conn(conn); > >> + conn = conn_not_found(ct, pkt, ctx, &state, > commit, now); > >> + break; > > > >[Antonio F] > >I see that conn_update can return just CT_UPDATE_VALID/INVALID/NEW > >but should we consider a 'default' case here, eg > >default: > > state |= CS_INVALID; > > break; > >? > > I'm not too worried about that because if we add another value to the > enum without explicitly handling it here we will get a warning. > > Is there a particular case you're worried about?
[Antonio F] No, there's no particular case. It's just a safer approach to handle any possible situation, future changes and so on. > > > > > > >> + } > >> + } > >> + > >> + pkt->md.ct_label = conn->label; > >> + pkt->md.ct_mark = conn->mark; > > > >[Antonio F] > >Can we skip the previous 2 lines as they are implemented into > write_ct_md ? > > Yes we can, thank you for spotting this! [Antonio F] np > > > > > > >> + write_ct_md(pkt, state, zone, conn->mark, conn->label); > >> + } else { > >> + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); > >> + write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}}); > >> + } > >> + > >> + return conn; > >> +} > >> + > >> +/* Sends a group of 'cnt' packets ('pkts') through the connection > tracker > >> + * 'ct'. If 'commit' is true, the packets are allowed to create > new entries > >> + * in the connection tables. 'setmark', if not NULL, should > point to a two > >> + * elements array containing a value and a mask to set the > connection mark. 
> >> + * 'setlabel' behaves similarly for the connection label.*/ > >> +int > >> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, > size_t cnt, > >> + bool commit, uint16_t zone, const uint32_t > *setmark, > >> + const struct ovs_key_ct_labels *setlabel, > >> + const char *helper) > >> +{ > >> +#if !defined(__CHECKER__) && !defined(_WIN32) > >> + const size_t KEY_ARRAY_SIZE = cnt; > >> +#else > >> + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST }; > >> +#endif > >> + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE]; > >> + int8_t bucket_list[CONNTRACK_BUCKETS]; > >> + struct { > >> + unsigned bucket; > >> + unsigned long maps; > >> + } arr[KEY_ARRAY_SIZE]; > >> + long long now = time_msec(); > >> + size_t i = 0; > >> + uint8_t arrcnt = 0; > >> + > >> + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= > >> NETDEV_MAX_BURST); > >> + > >> + if (helper) { > >> + static struct vlog_rate_limit rl = > VLOG_RATE_LIMIT_INIT(5, 5); > >> + > >> + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", > helper); > >> + /* Continue without the helper */ > >> + } > >> + > >> + memset(bucket_list, INT8_C(-1), sizeof bucket_list); > >> + for (i = 0; i < cnt; i++) { > >> + unsigned bucket; > >> + > >> + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) { > >> + write_ct_md(pkts[i], CS_INVALID, zone, 0, > (ovs_u128){{0}}); > >> + continue; > >> + } > >> + > >> + bucket = hash_to_bucket(ctxs[i].hash); > >> + if (bucket_list[bucket] == INT8_C(-1)) { > >> + bucket_list[bucket] = arrcnt; > >> + > >> + arr[arrcnt].maps = 0; > >> + ULLONG_SET1(arr[arrcnt].maps, i); > >> + arr[arrcnt++].bucket = bucket; > >> + } else { > >> + ULLONG_SET1(arr[bucket_list[bucket]].maps, i); > >> + arr[bucket_list[bucket]].maps |= 1UL << i; > >> + } > >> + } > >> + > >> + for (i = 0; i < arrcnt; i++) { > >> + size_t j; > >> + > >> + ct_lock_lock(&ct->locks[arr[i].bucket]); > >> + > >> + ULLONG_FOR_EACH_1(j, arr[i].maps) { > >> + struct conn *conn; > >> + > >> + conn_key_lookup(ct, &ctxs[j], 
arr[i].bucket, now); > >> + > >> + conn = process_one(ct, pkts[j], &ctxs[j], zone, > commit, now); > >> + > >> + if (conn && setmark) { > >> + set_mark(pkts[j], conn, setmark[0], setmark[1]); > >> + } > >> + > >> + if (conn && setlabel) { > >> + set_label(pkts[j], conn, &setlabel[0], > &setlabel[1]); > >> + } > >> + } > >> + ct_lock_unlock(&ct->locks[arr[i].bucket]); > >> + } > >> + > >> + return 0; > >> +} > >> + > >> +static void > >> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, > uint32_t > >> mask) > >> +{ > >> + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask)); > >> + conn->mark = pkt->md.ct_mark; > >> +} > >> + > >> +static void > >> +set_label(struct dp_packet *pkt, struct conn *conn, > >> + const struct ovs_key_ct_labels *val, > >> + const struct ovs_key_ct_labels *mask) > >> +{ > >> + ovs_u128 v, m; > >> + > >> + memcpy(&v, val, sizeof v); > >> + memcpy(&m, mask, sizeof m); > >> + > >> + pkt->md.ct_label.u64.lo = v.u64.lo > >> + | (pkt->md.ct_label.u64.lo & > ~(m.u64.lo)); > >> + pkt->md.ct_label.u64.hi = v.u64.hi > >> + | (pkt->md.ct_label.u64.hi & > ~(m.u64.hi)); > >> + conn->label = pkt->md.ct_label; > >> +} > >> + > > > >> +#define CONNTRACK_PURGE_NUM 256 > >> + > >> +static void > >> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket, > >> + uint32_t *inner_offset, unsigned *left, long long > now) > >> +{ > >> + while (*left != 0) { > >> + struct hmap_node *node; > >> + struct conn *conn; > >> + > >> + node = hmap_at_position(bucket, inner_bucket, > inner_offset); > >> + > >> + if (!node) { > >> + hmap_shrink(bucket); > >> + break; > >> + } > >> + > >> + INIT_CONTAINER(conn, node, node); > >> + if (conn_expired(conn, now)) { > >> + hmap_remove(bucket, &conn->node); > >> + delete_conn(conn); > >> + (*left)--; > >> + } > >> + } > >> +} > >> + > >> +/* Cleans up old connection entries. Should be called > periodically. 
*/ > >> +void > >> +conntrack_run(struct conntrack *ct) > >> +{ > >> + unsigned bucket = hash_to_bucket(ct->purge_bucket); > >> + uint32_t inner_bucket = ct->purge_inner_bucket, > >> + inner_offset = ct->purge_inner_offset; > >> + unsigned left = CONNTRACK_PURGE_NUM; > >> + long long now = time_msec(); > >> + > >> + while (bucket < CONNTRACK_BUCKETS) { > >> + ct_lock_lock(&ct->locks[bucket]); > >> + sweep_bucket(&ct->connections[bucket], > >> + &inner_bucket, &inner_offset, > >> + &left, now); > >> + ct_lock_unlock(&ct->locks[bucket]); > >> + > >> + if (left == 0) { > >> + break; > >> + } else { > >> + bucket++; > >> + } > >> + } > >> + > >> + ct->purge_bucket = bucket; > >> + ct->purge_inner_bucket = inner_bucket; > >> + ct->purge_inner_offset = inner_offset; > >> +} > >> + > > > >> +/* Key extraction */ > >> + > >> +/* The function stores a pointer to the first byte after the > header in > >> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > >> + * not interested in the header's tail, meaning that the header > has > >> + * already been parsed (e.g. by flow_extract): we take this as a > hint to > >> + * save a few checks. 
*/ > >> +static inline bool > >> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t > size, > >> + const char **new_data) > >> +{ > >> + const struct ip_header *ip = data; > >> + > >> + if (new_data) { > >> + size_t ip_len; > >> + > >> + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) { > >> + return false; > >> + } > >> + ip_len = IP_IHL(ip->ip_ihl_ver) * 4; > >> + > >> + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) { > >> + return false; > >> + } > >> + if (OVS_UNLIKELY(size < ip_len)) { > >> + return false; > >> + } > >> + > >> + *new_data = (char *) data + ip_len; > >> + } > >> + > >> + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { > >> + return false; > >> + } > >> + > >> + key->src.addr.ipv4 = ip->ip_src; > >> + key->dst.addr.ipv4 = ip->ip_dst; > >> + key->nw_proto = ip->ip_proto; > >> + > >> + return true; > >> +} > >> + > >> +/* The function stores a pointer to the first byte after the > header in > >> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the > caller is > >> + * not interested in the header's tail, meaning that the header > has > >> + * already been parsed (e.g. by flow_extract): we take this as a > hint to > >> + * save a few checks. 
*/ > >> +static inline bool > >> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t > size, > >> + const char **new_data) > >> +{ > >> + const struct ovs_16aligned_ip6_hdr *ip6 = data; > >> + uint8_t nw_proto = ip6->ip6_nxt; > >> + uint8_t nw_frag = 0; > >> + > >> + if (new_data) { > >> + if (OVS_UNLIKELY(size < sizeof *ip6)) { > >> + return false; > >> + } > >> + } > >> + > >> + data = ip6 + 1; > >> + size -= sizeof *ip6; > >> + > >> + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) > { > >> + return false; > >> + } > >> + > >> + if (new_data) { > >> + *new_data = data; > >> + } > >> + > >> + if (nw_frag) { > >> + return false; > >> + } > >> + > >> + key->src.addr.ipv6 = ip6->ip6_src; > >> + key->dst.addr.ipv6 = ip6->ip6_dst; > >> + key->nw_proto = nw_proto; > >> + > >> + return true; > >> +} > >> + > >> +static inline bool > >> +check_l4_tcp(const void *data, size_t size) > >> +{ > >> + const struct tcp_header *tcp = data; > >> + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; > >> + > >> + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) > { > >> + return true; > >> + } > >> + > >> + return false; > >> +} > >> + > >> +static inline bool > >> +extract_l4_tcp(struct conn_key *key, const void *data, size_t > size) > >> +{ > >> + const struct tcp_header *tcp = data; > >> + > >> + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) { > >> + return false; > >> + } > >> + > >> + key->src.port = tcp->tcp_src; > >> + key->dst.port = tcp->tcp_dst; > >> + > >> + return true; > >> +} > >> + > >> +static inline bool > >> +extract_l4_udp(struct conn_key *key, const void *data, size_t > size) > >> +{ > >> + const struct udp_header *udp = data; > >> + > >> + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) { > >> + return false; > >> + } > >> + > >> + key->src.port = udp->udp_src; > >> + key->dst.port = udp->udp_dst; > >> + > >> + return true; > >> +} > >> + > >> +static inline bool extract_l4(struct conn_key *key, const void > *data, > >> + size_t size, bool 
*related); > >> + > >> +/* If 'related' is not NULL and the function is processing an > ICMP > >> + * error packet, extract the l3 and l4 fields from the nested > header > >> + * instead and set *related to true. If 'related' is NULL we're > >> + * already processing a nested header and no such recursion is > >> + * possible */ > >> +static inline int > >> +extract_l4_icmp(struct conn_key *key, const void *data, size_t > size, > >> + bool *related) > >> +{ > >> + const struct icmp_header *icmp = data; > >> + > >> + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) { > >> + return false; > >> + } > >> + > >> + switch (icmp->icmp_type) { > >> + case ICMP4_ECHO_REQUEST: > >> + case ICMP4_ECHO_REPLY: > >> + case ICMP4_TIMESTAMP: > >> + case ICMP4_TIMESTAMPREPLY: > >> + case ICMP4_INFOREQUEST: > >> + case ICMP4_INFOREPLY: > >> + /* Separate ICMP connection: identified using id */ > >> + key->src.port = key->dst.port = icmp- > >icmp_fields.echo.id; > >> + break; > >> + case ICMP4_DST_UNREACH: > >> + case ICMP4_TIME_EXCEEDED: > >> + case ICMP4_PARAM_PROB: > >> + case ICMP4_SOURCEQUENCH: > >> + case ICMP4_REDIRECT: { > >> + /* ICMP packet part of another connection. 
We should > >> + * extract the key from embedded packet header */ > >> + struct conn_key inner_key; > >> + const char *l3 = (const char *) (icmp + 1); > >> + const char *tail = (const char *) data + size; > >> + const char *l4; > >> + bool ok; > >> + > >> + if (!related) { > >> + return false; > >> + } > >> + *related = true; > >> + > >> + memset(&inner_key, 0, sizeof inner_key); > >> + inner_key.dl_type = htons(ETH_TYPE_IP); > >> + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4); > >> + if (!ok) { > >> + return false; > >> + } > >> + > >> + /* pf doesn't do this, but it seems a good idea */ > >> + if (inner_key.src.addr.ipv4_aligned != key- > >dst.addr.ipv4_aligned > >> + || inner_key.dst.addr.ipv4_aligned != key- > >src.addr.ipv4_aligned) { > >> + return false; > >> + } > >> + > >> + key->src = inner_key.src; > >> + key->dst = inner_key.dst; > >> + key->nw_proto = inner_key.nw_proto; > >> + > >> + ok = extract_l4(key, l4, tail - l4, NULL); > >> + if (ok) { > >> + conn_key_reverse(key); > >> + } > >> + return ok; > >> + } > >> + default: > >> + return false; > >> + } > >> + > >> + return true; > >> +} > >> + > >> +/* If 'related' is not NULL and the function is processing an > ICMP > >> + * error packet, extract the l3 and l4 fields from the nested > header > >> + * instead and set *related to true. 
If 'related' is NULL we're > >> + * already processing a nested header and no such recursion is > >> + * possible */ > >> +static inline bool > >> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t > size, > >> + bool *related) > >> +{ > >> + const struct icmp6_header *icmp6 = data; > >> + > >> + /* All the messages that we support need at least 4 bytes > after > >> + * the header */ > >> + if (size < sizeof *icmp6 + 4) { > >> + return false; > >> + } > >> + > >> + switch (icmp6->icmp6_type) { > >> + case ICMP6_ECHO_REQUEST: > >> + case ICMP6_ECHO_REPLY: > >> + /* Separate ICMP connection: identified using id */ > >> + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + > 1); > >> + break; > >> + case ICMP6_DST_UNREACH: > >> + case ICMP6_PACKET_TOO_BIG: > >> + case ICMP6_TIME_EXCEEDED: > >> + case ICMP6_PARAM_PROB:{ > >> + /* ICMP packet part of another connection. We should > >> + * extract the key from embedded packet header */ > >> + struct conn_key inner_key; > >> + const char *l3 = (const char *) icmp6 + 8; > >> + const char *tail = (const char *) data + size; > >> + const char *l4 = NULL; > >> + bool ok; > >> + > >> + if (!related) { > >> + return false; > >> + } > >> + *related = true; > >> + > >> + memset(&inner_key, 0, sizeof inner_key); > >> + inner_key.dl_type = htons(ETH_TYPE_IPV6); > >> + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4); > >> + if (!ok) { > >> + return false; > >> + } > >> + > >> + /* pf doesn't do this, but it seems a good idea */ > >> + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned, > >> + &key->dst.addr.ipv6_aligned) > >> + || > !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned, > >> + &key->src.addr.ipv6_aligned)) { > >> + return false; > >> + } > >> + > >> + key->src = inner_key.src; > >> + key->dst = inner_key.dst; > >> + key->nw_proto = inner_key.nw_proto; > >> + > >> + ok = extract_l4(key, l4, tail - l4, NULL); > >> + if (ok) { > >> + conn_key_reverse(key); > >> + } > >> + return ok; > >> + } > >> 
+ default: > >> + return false; > >> + } > >> + > >> + return true; > >> +} > >> + > >> +/* Extract l4 fields into 'key', which must already contain valid > l3 > >> + * members. If 'related' is not NULL and an ICMP error packet is > being > >> + * processed, the function will extract the key from the packet > nested > >> + * in the ICMP paylod and set '*related' to true. If 'related' > is NULL, > >> + * nested parsing isn't allowed. This is necessary to limit the > >> + * recursion level. */ > >> +static inline bool > >> +extract_l4(struct conn_key *key, const void *data, size_t size, > bool *related) > >> +{ > >> + if (key->nw_proto == IPPROTO_TCP) { > >> + return extract_l4_tcp(key, data, size) > >> + && (!related || check_l4_tcp(data, size)); > >> + } else if (key->nw_proto == IPPROTO_UDP) { > >> + return extract_l4_udp(key, data, size); > >> + } else if (key->dl_type == htons(ETH_TYPE_IP) > >> + && key->nw_proto == IPPROTO_ICMP) { > >> + return extract_l4_icmp(key, data, size, related); > >> + } else if (key->dl_type == htons(ETH_TYPE_IPV6) > >> + && key->nw_proto == IPPROTO_ICMPV6) { > >> + return extract_l4_icmp6(key, data, size, related); > >> + } else { > >> + return false; > >> + } > >> +} > >> + > >> +static bool > >> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, > >> + struct conn_lookup_ctx *ctx, uint16_t zone) > >> +{ > >> + const struct eth_header *l2 = dp_packet_l2(pkt); > >> + const struct ip_header *l3 = dp_packet_l3(pkt); > >> + const char *l4 = dp_packet_l4(pkt); > >> + const char *tail = dp_packet_tail(pkt); > >> + bool ok; > >> + > >> + memset(ctx, 0, sizeof *ctx); > >> + > >> + if (!l2 || !l3 || !l4) { > >> + return false; > >> + } > >> + > >> + ctx->key.zone = zone; > >> + > >> + /* XXX In this function we parse the packet (again, it has > already > >> + * gone through miniflow_extract()) for two reasons: > >> + * > >> + * 1) To extract the l3 addresses and l4 ports. > >> + * We already have the l3 and l4 headers' pointers. 
> Extracting > >> + * the l3 addresses and the l4 ports is really cheap, > since they > >> + * can be found at fixed locations. > >> + * 2) To extract the l3 and l4 types. > >> + * Extracting the l3 and l4 types (especially the l3[1]) > on the > >> + * other hand is quite expensive, because they're not at a > >> + * fixed location. > >> + * > >> + * Here's a way to avoid (2) with the help of the datapath. > >> + * The datapath doesn't keep the packet's extracted flow[2], > so > >> + * using that is not an option. We could use the packet's > matching > >> + * megaflow, but we have to make sure that the l3 and l4 > types > >> + * are unwildcarded. This means either: > >> + * > >> + * a) dpif-netdev unwildcards the l3 (and l4) types when a > new flow > >> + * is installed if the actions contains ct(). This is > what the > >> + * kernel datapath does. It is not so straightforward, > though. > >> + * > >> + * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types > when > >> + * translating a ct() action. This is already done in > different > >> + * actions and since both the userspace and the kernel > datapath > >> + * would benefit from it, it seems an appropriate place to > do > >> + * it. > >> + * > >> + * --- > >> + * [1] A simple benchmark (running only the connection > tracker > >> + * over and over on the same packets) shows that if the > >> + * l3 type is already provided we are 15% faster (running > the > >> + * connection tracker over a couple of DPDK devices with > a > >> + * stream of UDP 64-bytes packets shows that we are 4% > faster). > >> + * > >> + * [2] The reasons for this are that keeping the flow > increases > >> + * (slightly) the cache footprint and increases > computation > >> + * time as we move the packet around. Most importantly, > the flow > >> + * should be updated by the actions and this can be slow, > as > >> + * we use a sparse representation (miniflow). 
> >> + * > >> + */ > >> + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) > l2); > >> + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { > > > >[Antonio F] In case we specify a flow with VLAN + CT like > >add_flow brX table=0,in_port=1,vlan_tci=0xf123,tcp,ct_state=- > trk,action=ct(commit,zone=9),2 > >should we also consider ETH_TYPE_VLAN in the previous conditional > statement? > > parse_dl_type() should skip vlans and return the L3 type. The > connection tracker > is interested in what's beyond the vlan tag. [Antonio F] ok, got it. > > > > > > >> + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, > NULL); > >> + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { > >> + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, > NULL); > >> + } else { > >> + ok = false; > >> + } > >> + > >> + if (ok) { > >> + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) > { > >> + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); > >> + return true; > >> + } > >> + } > >> + > >> + return false; > >> +} > >> + > > > >> +/* Symmetric */ > >> +static uint32_t > >> +conn_key_hash(const struct conn_key *key, uint32_t basis) > >> +{ > >> + uint32_t hsrc, hdst, hash; > >> + int i; > >> + > >> + hsrc = hdst = basis; > >> + > >> + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { > >> + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); > >> + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); > >> + } > >> + > >> + hash = hsrc ^ hdst; > >> + > >> + hash = hash_words((uint32_t *) &key->dst + 1, > >> + (uint32_t *) (key + 1) - (uint32_t *) > (&key->dst + 1), > >> + hash); > > > >[Antonio F] > >Is that to involve dl_type, nw_proto and zone in the hash > computation? 
> > Yes, I'll add a comment to make that clear > > > > > > >> + > >> + return hash; > >> +} > >> + > >> +static void > >> +conn_key_reverse(struct conn_key *key) > >> +{ > >> + struct ct_endpoint tmp; > >> + tmp = key->src; > >> + key->src = key->dst; > >> + key->dst = tmp; > >> +} > >> + > >> +static void > >> +conn_key_lookup(struct conntrack *ct, > >> + struct conn_lookup_ctx *ctx, > >> + unsigned bucket, > >> + long long now) > >> +{ > >> + struct conn *conn, *found = NULL; > >> + uint32_t hash = ctx->hash; > >> + bool reply; > >> + > >> + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct- > >> >connections[bucket]) { > >> + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) { > >> + found = conn; > >> + reply = false; > >> + break; > >> + } > >> + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn- > >rev_key))) { > >> + found = conn; > >> + reply = true; > >> + break; > >> + } > >> + } > >> + > >> + if (found) { > >> + if (conn_expired(found, now)) { > >> + found = NULL; > >> + } else { > >> + ctx->reply = reply; > >> + } > >> + } > >> + > >> + ctx->conn = found; > >> +} > >> + > >> +static enum ct_update_res > >> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply, > >> + long long now) > >> +{ > >> + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, > reply, > >> now); > >> +} > >> + > >> +static bool > >> +conn_expired(struct conn *conn, long long now) > >> +{ > >> + return now >= conn->expiration; > >> +} > >> + > >> +static bool > >> +valid_new(struct dp_packet *pkt, struct conn_key *key) > >> +{ > >> + return l4_protos[key->nw_proto]->valid_new(pkt); > >> +} > >> + > >> +static struct conn * > >> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long > now) > >> +{ > >> + struct conn *newconn; > >> + > >> + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now); > >> + > >> + if (newconn) { > >> + newconn->key = *key; > >> + } > >> + > >> + return newconn; > >> +} > >> + > >> +static void > >> +delete_conn(struct conn 
*conn) > >> +{ > >> + free(conn); > >> +} > >> diff --git a/lib/conntrack.h b/lib/conntrack.h > >> new file mode 100644 > >> index 0000000..8561273 > >> --- /dev/null > >> +++ b/lib/conntrack.h > >> @@ -0,0 +1,144 @@ > >> +/* > >> + * Copyright (c) 2015, 2016 Nicira, Inc. > >> + * > >> + * Licensed under the Apache License, Version 2.0 (the > "License"); > >> + * you may not use this file except in compliance with the > License. > >> + * You may obtain a copy of the License at: > >> + * > >> + * http://www.apache.org/licenses/LICENSE-2.0 > >> + * > >> + * Unless required by applicable law or agreed to in writing, > software > >> + * distributed under the License is distributed on an "AS IS" > BASIS, > >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express > or > >> implied. > >> + * See the License for the specific language governing > permissions and > >> + * limitations under the License. > >> + */ > >> + > >> +#ifndef CONNTRACK_H > >> +#define CONNTRACK_H 1 > >> + > >> +#include <stdbool.h> > >> + > >> +#include "hmap.h" > >> +#include "netdev-dpdk.h" > >> +#include "odp-netlink.h" > >> +#include "openvswitch/thread.h" > >> +#include "openvswitch/types.h" > >> + > >> + > >> +struct dp_packet; > >> + > >> +/* Userspace connection tracker > >> + * ============================ > >> + * > >> + * This is a connection tracking module that keeps all the state > in userspace. > >> + * > >> + * Usage > >> + * ===== > >> + * > >> + * struct conntract ct; > >> + * > >> + * Initialization: > >> + * > >> + * conntrack_init(&ct); > >> + * > >> + * It is necessary to periodically issue a call to > >> + * > >> + * conntrack_run(&ct); > >> + * > >> + * to allow the module to clean up expired connections. 
> >> + * > >> + * To send a group of packets through the connection tracker: > >> + * > >> + * conntrack_execute(&ct, pkts, n_pkts, ...); > >> + * > >> + * Thread-safety > >> + * ============= > >> + * > >> + * conntrack_execute() can be called by multiple threads > simultaneoulsy. > >> + */ > >> + > >> +struct conntrack; > >> + > >> +void conntrack_init(struct conntrack *); > >> +void conntrack_run(struct conntrack *); > >> +void conntrack_destroy(struct conntrack *); > >> + > >> +int conntrack_execute(struct conntrack *, struct dp_packet **, > size_t, > >> + bool commit, uint16_t zone, const uint32_t > *setmark, > >> + const struct ovs_key_ct_labels *setlabel, > >> + const char *helper); > >> + > > > >> +/* struct ct_lock is a standard mutex or a spinlock when using > DPDK */ > >> + > >> +#ifdef DPDK_NETDEV > >> +struct OVS_LOCKABLE ct_lock { > >> + rte_spinlock_t lock; > >> +}; > >> + > >> +static inline void ct_lock_init(struct ct_lock *lock) > >> +{ > >> + rte_spinlock_init(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_lock(struct ct_lock *lock) > >> + OVS_ACQUIRES(lock) > >> + OVS_NO_THREAD_SAFETY_ANALYSIS > >> +{ > >> + rte_spinlock_lock(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_unlock(struct ct_lock *lock) > >> + OVS_RELEASES(lock) > >> + OVS_NO_THREAD_SAFETY_ANALYSIS > >> +{ > >> + rte_spinlock_unlock(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_destroy(struct ct_lock *lock > OVS_UNUSED) > >> +{ > >> +} > >> +#else > >> +struct OVS_LOCKABLE ct_lock { > >> + struct ovs_mutex lock; > >> +}; > >> + > >> +static inline void ct_lock_init(struct ct_lock *lock) > >> +{ > >> + ovs_mutex_init(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_lock(struct ct_lock *lock) > >> + OVS_ACQUIRES(lock) > >> + OVS_NO_THREAD_SAFETY_ANALYSIS > >> +{ > >> + ovs_mutex_lock(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_unlock(struct ct_lock *lock) > >> + OVS_RELEASES(lock) > >> + 
OVS_NO_THREAD_SAFETY_ANALYSIS > >> +{ > >> + ovs_mutex_unlock(&lock->lock); > >> +} > >> + > >> +static inline void ct_lock_destroy(struct ct_lock *lock) > >> +{ > >> + ovs_mutex_destroy(&lock->lock); > >> +} > >> +#endif > >> + > > > >> +#define CONNTRACK_BUCKETS_SHIFT 8 > >> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) > >> + > >> +struct conntrack { > >> + /* Each lock guards a 'connections' bucket */ > >> + struct ct_lock locks[CONNTRACK_BUCKETS]; > >> + struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED; > >> + uint32_t hash_basis; > >> + unsigned purge_bucket; > >> + uint32_t purge_inner_bucket; > >> + uint32_t purge_inner_offset; > >> +}; > >> + > >> +#endif /* conntrack.h */ > >> -- > >> 2.1.4 > >> > >> _______________________________________________ > >> dev mailing list > >> dev@openvswitch.org > >> http://openvswitch.org/mailman/listinfo/dev _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev