Thank you for your comments, Flavio!
On 26/04/2016 16:35, "Flavio Leitner" <f...@sysclose.org> wrote: >On Fri, Apr 15, 2016 at 05:02:36PM -0700, Daniele Di Proietto wrote: >> This commit adds the conntrack module. >> >> It is a connection tracker that resides entirely in userspace. Its >> primary user will be the dpif-netdev datapath. >> >> The module main goal is to provide conntrack_execute(), which offers a >> convenient interface to implement the datapath ct() action. >> >> The conntrack module uses two submodules to deal with the l4 protocol >> details (conntrack-other for UDP and ICMP, conntrack-tcp for TCP). >> >> The conntrack-tcp submodule implementation is adapted from FreeBSD's pf >> subsystem, therefore it's BSD licensed. It has been slightly altered to >> match the OVS coding style and to allow the pickup of already >> established connections. >> >> Signed-off-by: Daniele Di Proietto <diproiet...@vmware.com> >> --- >> COPYING | 1 + >> debian/copyright.in | 4 + >> lib/automake.mk | 5 + >> lib/conntrack-other.c | 91 ++++++ >> lib/conntrack-private.h | 77 +++++ >> lib/conntrack-tcp.c | 476 +++++++++++++++++++++++++++ >> lib/conntrack.c | 851 >> ++++++++++++++++++++++++++++++++++++++++++++++++ >> lib/conntrack.h | 144 ++++++++ >> 8 files changed, 1649 insertions(+) >> create mode 100644 lib/conntrack-other.c >> create mode 100644 lib/conntrack-private.h >> create mode 100644 lib/conntrack-tcp.c >> create mode 100644 lib/conntrack.c >> create mode 100644 lib/conntrack.h >> >> diff --git a/COPYING b/COPYING >> index 308e3ea..afb98b9 100644 >> --- a/COPYING >> +++ b/COPYING >> @@ -25,6 +25,7 @@ License, version 2. >> The following files are licensed under the 2-clause BSD license. 
>> include/windows/getopt.h >> lib/getopt_long.c >> + lib/conntrack-tcp.c >> >> The following files are licensed under the 3-clause BSD-license >> include/windows/netinet/icmp6.h >> diff --git a/debian/copyright.in b/debian/copyright.in >> index 57d007a..a15f4dd 100644 >> --- a/debian/copyright.in >> +++ b/debian/copyright.in >> @@ -21,6 +21,9 @@ Upstream Copyright Holders: >> Copyright (c) 2014 Michael Chapman >> Copyright (c) 2014 WindRiver, Inc. >> Copyright (c) 2014 Avaya, Inc. >> + Copyright (c) 2001 Daniel Hartmeier >> + Copyright (c) 2002 - 2008 Henning Brauer >> + Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> >> >> License: >> >> @@ -90,6 +93,7 @@ License: >> lib/getopt_long.c >> include/windows/getopt.h >> datapath-windows/ovsext/Conntrack-tcp.c >> + lib/conntrack-tcp.c >> >> * The following files are licensed under the 3-clause BSD-license >> >> diff --git a/lib/automake.mk b/lib/automake.mk >> index 1ec2115..ba30442 100644 >> --- a/lib/automake.mk >> +++ b/lib/automake.mk >> @@ -47,6 +47,11 @@ lib_libopenvswitch_la_SOURCES = \ >> lib/compiler.h \ >> lib/connectivity.c \ >> lib/connectivity.h \ >> + lib/conntrack-private.h \ >> + lib/conntrack-tcp.c \ >> + lib/conntrack-other.c \ >> + lib/conntrack.c \ >> + lib/conntrack.h \ >> lib/coverage.c \ >> lib/coverage.h \ >> lib/crc32c.c \ >> diff --git a/lib/conntrack-other.c b/lib/conntrack-other.c >> new file mode 100644 >> index 0000000..65d02a9 >> --- /dev/null >> +++ b/lib/conntrack-other.c >> @@ -0,0 +1,91 @@ >> +/* >> + * Copyright (c) 2015, 2016 Nicira, Inc. >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. 
>> + * You may obtain a copy of the License at: >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +#include <config.h> >> + >> +#include "conntrack-private.h" >> +#include "dp-packet.h" >> + >> +enum other_state { >> + OTHERS_FIRST, >> + OTHERS_MULTIPLE, >> + OTHERS_BIDIR, >> +}; >> + >> +struct conn_other { >> + struct conn up; >> + enum other_state state; >> +}; >> + >> +static const long long other_timeouts[] = { >> + [OTHERS_FIRST] = 60 * 1000, >> + [OTHERS_MULTIPLE] = 60 * 1000, >> + [OTHERS_BIDIR] = 30 * 1000, >> +}; > >I missed a description here, like the unit. Right, I added a comment. > >> + >> +static struct conn_other * >> +conn_other_cast(const struct conn *conn) >> +{ >> + return CONTAINER_OF(conn, struct conn_other, up); >> +} >> + >> +static void >> +update_expiration(struct conn_other *conn, long long now) >> +{ >> + conn->up.expiration = now + other_timeouts[conn->state]; >> +} >> + >> +static enum ct_update_res >> +other_conn_update(struct conn *conn_, struct dp_packet *pkt OVS_UNUSED, >> + bool reply, long long now) >> +{ >> + struct conn_other *conn = conn_other_cast(conn_); >> + >> + if (reply && conn->state != OTHERS_BIDIR) { >> + conn->state = OTHERS_BIDIR; >> + } else if (conn->state == OTHERS_FIRST) { >> + conn->state = OTHERS_MULTIPLE; >> + } >> + >> + update_expiration(conn, now); >> + >> + return CT_UPDATE_VALID; >> +} >> + >> +static bool >> +other_valid_new(struct dp_packet *pkt OVS_UNUSED) >> +{ >> + return true; >> +} >> + >> +static struct conn * >> +other_new_conn(struct dp_packet *pkt OVS_UNUSED, long long now) >> +{ >> + struct conn_other *conn; >> + >> + conn = 
xzalloc(sizeof(struct conn_other)); >> + conn->state = OTHERS_FIRST; >> + >> + update_expiration(conn, now); >> + >> + return &conn->up; >> +} >> + >> +struct ct_l4_proto ct_proto_other = { >> + .new_conn = other_new_conn, >> + .valid_new = other_valid_new, >> + .conn_update = other_conn_update, >> +}; >> diff --git a/lib/conntrack-private.h b/lib/conntrack-private.h >> new file mode 100644 >> index 0000000..e668c44 >> --- /dev/null >> +++ b/lib/conntrack-private.h >> @@ -0,0 +1,77 @@ >> +/* >> + * Copyright (c) 2015, 2016 Nicira, Inc. >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. >> + * You may obtain a copy of the License at: >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +#ifndef CONNTRACK_PRIVATE_H >> +#define CONNTRACK_PRIVATE_H 1 >> + >> +#include <sys/types.h> >> +#include <netinet/in.h> >> +#include <netinet/ip6.h> >> + >> +#include "hmap.h" >> +#include "openvswitch/types.h" >> +#include "packets.h" >> +#include "unaligned.h" >> + >> +struct ct_addr { >> + union { >> + ovs_16aligned_be32 ipv4; >> + union ovs_16aligned_in6_addr ipv6; >> + ovs_be32 ipv4_aligned; >> + struct in6_addr ipv6_aligned; >> + }; >> +}; >> + >> +struct ct_endpoint { >> + struct ct_addr addr; >> + ovs_be16 port; >> +}; >> + >> +struct conn_key { >> + struct ct_endpoint src; >> + struct ct_endpoint dst; >> + >> + ovs_be16 dl_type; >> + uint8_t nw_proto; >> + uint16_t zone; >> +}; > >Based on the above I presume we consider all IPs in the same space >regardless of the vlan, correct? 
Yes, I think this is what the kernel connection tracker does. > > >> + >> +struct conn { >> + struct conn_key key; >> + struct conn_key rev_key; >> + long long expiration; >> + struct hmap_node node; >> + uint32_t mark; >> + ovs_u128 label; >> +}; >> + >> +enum ct_update_res { >> + CT_UPDATE_INVALID, >> + CT_UPDATE_VALID, >> + CT_UPDATE_NEW, >> +}; >> + >> +struct ct_l4_proto { >> + struct conn *(*new_conn)(struct dp_packet *pkt, long long now); >> + bool (*valid_new)(struct dp_packet *pkt); >> + enum ct_update_res (*conn_update)(struct conn *conn, struct dp_packet >> *pkt, >> + bool reply, long long now); >> +}; >> + >> +extern struct ct_l4_proto ct_proto_tcp; >> +extern struct ct_l4_proto ct_proto_other; >> + >> +#endif /* conntrack-private.h */ >> diff --git a/lib/conntrack-tcp.c b/lib/conntrack-tcp.c >> new file mode 100644 >> index 0000000..4d80038 >> --- /dev/null >> +++ b/lib/conntrack-tcp.c >> @@ -0,0 +1,476 @@ >> +/*- >> + * Copyright (c) 2001 Daniel Hartmeier >> + * Copyright (c) 2002 - 2008 Henning Brauer >> + * Copyright (c) 2012 Gleb Smirnoff <gleb...@freebsd.org> >> + * Copyright (c) 2015, 2016 Nicira, Inc. >> + * All rights reserved. >> + * >> + * Redistribution and use in source and binary forms, with or without >> + * modification, are permitted provided that the following conditions >> + * are met: >> + * >> + * - Redistributions of source code must retain the above copyright >> + * notice, this list of conditions and the following disclaimer. >> + * - Redistributions in binary form must reproduce the above >> + * copyright notice, this list of conditions and the following >> + * disclaimer in the documentation and/or other materials provided >> + * with the distribution. >> + * >> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS >> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT >> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS >> + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE >> + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, >> + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, >> + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; >> + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER >> + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT >> + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN >> + * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE >> + * POSSIBILITY OF SUCH DAMAGE. >> + * >> + * Effort sponsored in part by the Defense Advanced Research Projects >> + * Agency (DARPA) and Air Force Research Laboratory, Air Force >> + * Materiel Command, USAF, under agreement number F30602-01-2-0537. >> + * >> + * $OpenBSD: pf.c,v 1.634 2009/02/27 12:37:45 henning Exp $ >> + */ >> + >> +#include <config.h> >> + >> +#include "conntrack-private.h" >> +#include "ct-dpif.h" >> +#include "dp-packet.h" >> +#include "util.h" >> + >> +struct tcp_peer { >> + enum ct_dpif_tcp_state state; >> + uint32_t seqlo; /* Max sequence number sent >> */ >> + uint32_t seqhi; /* Max the other end ACKd + win >> */ >> + uint16_t max_win; /* largest window (pre scaling) >> */ >> + uint8_t wscale; /* window scaling factor >> */ >> +}; >> + >> +struct conn_tcp { >> + struct conn up; >> + struct tcp_peer peer[2]; >> +}; >> + >> +enum { >> + TCPOPT_EOL, >> + TCPOPT_NOP, >> + TCPOPT_WINDOW = 3, >> +}; >> + >> +/* TCP sequence numbers are 32 bit integers operated >> + * on with modular arithmetic. These macros can be >> + * used to compare such integers. */ >> +#define SEQ_LT(a,b) ((int)((a)-(b)) < 0) >> +#define SEQ_LEQ(a,b) ((int)((a)-(b)) <= 0) >> +#define SEQ_GT(a,b) ((int)((a)-(b)) > 0) >> +#define SEQ_GEQ(a,b) ((int)((a)-(b)) >= 0) >> + >> +#define SEQ_MIN(a, b) ((SEQ_LT(a, b)) ? (a) : (b)) >> +#define SEQ_MAX(a, b) ((SEQ_GT(a, b)) ? 
(a) : (b)) > >I am okay to leave those here, but since they are generic >operations, maybe we could define somewhere else and just >redefine here as SEQ_ operations. I am fine both ways. Do you have a suggestion for the name of the generic macro? > > >> + >> +static struct conn_tcp* >> +conn_tcp_cast(const struct conn* conn) >> +{ >> + return CONTAINER_OF(conn, struct conn_tcp, up); >> +} >> + >> +/* pf does this in in pf_normalize_tcp(), and it is called only if scrub >> + * is enabled. We're not scrubbing, but this check seems reasonable. */ >> +static bool >> +tcp_invalid_flags(uint16_t flags) >> +{ >> + >> + if (flags & TCP_SYN) { >> + if (flags & TCP_RST) { >> + return true; >> + } >> + if (flags & TCP_FIN) { >> + /* Here pf removes the fin flag. We simply mark the packet as >> + * invalid */ > >Maybe I missed something but it seems TH_FIN and TH_RST are >treated equally (drop). I'd vote to remove the comment. Agreed, branches merged! > >> + return true; >> + } >> + } else { >> + /* Illegal packet */ >> + if (!(flags & (TCP_ACK|TCP_RST))) { >> + return true; >> + } >> + } >> + >> + if (!(flags & TCP_ACK)) { >> + /* These flags are only valid if ACK is set */ >> + if ((flags & TCP_FIN) || (flags & TCP_PSH) || (flags & TCP_URG)) { >> + return true; >> + } >> + } >> + >> + return false; >> +} >> + >> +#define TCP_MAX_WSCALE 14 >> +#define CT_WSCALE_FLAG 0x80 >> +#define CT_WSCALE_UNKNOWN 0x40 >> +#define CT_WSCALE_MASK 0xf >> + >> +static uint8_t >> +tcp_get_wscale(const struct tcp_header *tcp) >> +{ >> + int len = TCP_OFFSET(tcp->tcp_ctl) * 4 - sizeof *tcp; >> + const uint8_t *opt = (const uint8_t *)(tcp + 1); >> + uint8_t wscale = 0; >> + uint8_t optlen; >> + >> + while (len >= 3) { >> + if (*opt == TCPOPT_EOL) { >> + break; >> + } >> + switch (*opt) { > >Maybe we could do: > > case TCPOPT_EOL: > break; I think the goal was to exit the while loop. 
I guess we could do case TCPOPT_EOL: return wscale; >> + case TCPOPT_NOP: >> + opt++; >> + len--; >> + break; >> + case TCPOPT_WINDOW: >> + wscale = MIN(opt[2], TCP_MAX_WSCALE); >> + wscale |= CT_WSCALE_FLAG; >> + /* fall through */ >> + default: >> + optlen = opt[1]; >> + if (optlen < 2) { >> + optlen = 2; >> + } >> + len -= optlen; >> + opt += optlen; >> + } >> + } >> + >> + return wscale; >> +} >> + >> +static uint32_t >> +tcp_payload_length(struct dp_packet *pkt) >> +{ >> + return (char *) dp_packet_tail(pkt) - dp_packet_l2_pad_size(pkt) >> + - (char *) dp_packet_get_tcp_payload(pkt); >> +} >> + >> +static void >> +update_expiration(struct conn_tcp *conn, long long now, long long interval) >> +{ >> + conn->up.expiration = now + interval; >> +} >> + >> + >> +static enum ct_update_res >> +tcp_conn_update(struct conn* conn_, struct dp_packet *pkt, bool reply, >> + long long now) >> +{ >> + struct conn_tcp *conn = conn_tcp_cast(conn_); >> + struct tcp_header *tcp = dp_packet_l4(pkt); >> + /* The peer that sent 'pkt' */ >> + struct tcp_peer *src = &conn->peer[reply ? 1 : 0]; >> + /* The peer that should receive 'pkt' */ >> + struct tcp_peer *dst = &conn->peer[reply ? 
0 : 1]; >> + uint8_t sws = 0, dws = 0; >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); >> + >> + uint16_t win = ntohs(tcp->tcp_winsz); >> + uint32_t ack, end, seq, orig_seq; >> + uint32_t p_len = tcp_payload_length(pkt); >> + int ackskew; >> + >> + if (tcp_invalid_flags(tcp_flags)) { >> + return CT_UPDATE_INVALID; >> + } >> + >> + if (((tcp_flags & (TCP_SYN|TCP_ACK)) == TCP_SYN) && >> + dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 && >> + src->state >= CT_DPIF_TCPS_FIN_WAIT_2) { >> + src->state = dst->state = CT_DPIF_TCPS_CLOSED; >> + return CT_UPDATE_NEW; >> + } >> + >> + if (src->wscale & CT_WSCALE_FLAG >> + && dst->wscale & CT_WSCALE_FLAG >> + && !(tcp_flags & TCP_SYN)) { >> + >> + sws = src->wscale & CT_WSCALE_MASK; >> + dws = dst->wscale & CT_WSCALE_MASK; >> + >> + } else if (src->wscale & CT_WSCALE_UNKNOWN >> + && dst->wscale & CT_WSCALE_UNKNOWN >> + && !(tcp_flags & TCP_SYN)) { >> + >> + sws = TCP_MAX_WSCALE; >> + dws = TCP_MAX_WSCALE; >> + } >> + >> + /* >> + * Sequence tracking algorithm from Guido van Rooij's paper: >> + * http://www.madison-gurkha.com/publications/tcp_filtering/ >> + * tcp_filtering.ps >> + */ >> + >> + orig_seq = seq = ntohl(get_16aligned_be32(&tcp->tcp_seq)); >> + if (src->state < CT_DPIF_TCPS_SYN_SENT) { >> + /* First packet from this end. 
Set its state */ >> + >> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); >> + >> + end = seq + p_len; >> + if (tcp_flags & TCP_SYN) { >> + end++; >> + if (dst->wscale & CT_WSCALE_FLAG) { >> + src->wscale = tcp_get_wscale(tcp); >> + if (src->wscale & CT_WSCALE_FLAG) { >> + /* Remove scale factor from initial window */ >> + sws = src->wscale & CT_WSCALE_MASK; >> + win = DIV_ROUND_UP((uint32_t) win, 1 << sws); >> + dws = dst->wscale & CT_WSCALE_MASK; >> + } else { >> + /* fixup other window */ >> + dst->max_win <<= dst->wscale & >> + CT_WSCALE_MASK; >> + /* in case of a retrans SYN|ACK */ >> + dst->wscale = 0; >> + } >> + } >> + } >> + if (tcp_flags & TCP_FIN) { >> + end++; >> + } >> + >> + src->seqlo = seq; >> + src->state = CT_DPIF_TCPS_SYN_SENT; >> + /* >> + * May need to slide the window (seqhi may have been set by >> + * the crappy stack check or if we picked up the connection >> + * after establishment) >> + */ >> + if (src->seqhi == 1 || >> + SEQ_GEQ(end + MAX(1, dst->max_win << dws), src->seqhi)) { >> + src->seqhi = end + MAX(1, dst->max_win << dws); >> + } >> + if (win > src->max_win) { >> + src->max_win = win; >> + } >> + >> + } else { >> + ack = ntohl(get_16aligned_be32(&tcp->tcp_ack)); >> + end = seq + p_len; >> + if (tcp_flags & TCP_SYN) { >> + end++; >> + } >> + if (tcp_flags & TCP_FIN) { >> + end++; >> + } >> + } >> + >> + if ((tcp_flags & TCP_ACK) == 0) { >> + /* Let it pass through the ack skew check */ >> + ack = dst->seqlo; >> + } else if ((ack == 0 >> + && (tcp_flags & (TCP_ACK|TCP_RST)) == (TCP_ACK|TCP_RST)) >> + /* broken tcp stacks do not set ack */) { >> + /* Many stacks (ours included) will set the ACK number in an >> + * FIN|ACK if the SYN times out -- no sequence to ACK. 
*/ >> + ack = dst->seqlo; >> + } >> + >> + if (seq == end) { >> + /* Ease sequencing restrictions on no data packets */ >> + seq = src->seqlo; >> + end = seq; >> + } >> + >> + ackskew = dst->seqlo - ack; >> +#define MAXACKWINDOW (0xffff + 1500) /* 1500 is an arbitrary fudge >> factor */ >> + if (SEQ_GEQ(src->seqhi, end) >> + /* Last octet inside other's window space */ >> + && SEQ_GEQ(seq, src->seqlo - (dst->max_win << dws)) >> + /* Retrans: not more than one window back */ >> + && (ackskew >= -MAXACKWINDOW) >> + /* Acking not more than one reassembled fragment backwards */ >> + && (ackskew <= (MAXACKWINDOW << sws)) >> + /* Acking not more than one window forward */ >> + && ((tcp_flags & TCP_RST) == 0 || orig_seq == src->seqlo >> + || (orig_seq == src->seqlo + 1) || (orig_seq + 1 == >> src->seqlo))) { >> + /* Require an exact/+1 sequence match on resets when possible */ >> + >> + /* update max window */ >> + if (src->max_win < win) { >> + src->max_win = win; >> + } >> + /* synchronize sequencing */ >> + if (SEQ_GT(end, src->seqlo)) { >> + src->seqlo = end; >> + } >> + /* slide the window of what the other end can send */ >> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { >> + dst->seqhi = ack + MAX((win << sws), 1); >> + } >> + >> + /* update states */ >> + if (tcp_flags & TCP_SYN && src->state < CT_DPIF_TCPS_SYN_SENT) { >> + src->state = CT_DPIF_TCPS_SYN_SENT; >> + } >> + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) { >> + src->state = CT_DPIF_TCPS_CLOSING; >> + } >> + if (tcp_flags & TCP_ACK) { >> + if (dst->state == CT_DPIF_TCPS_SYN_SENT) { >> + dst->state = CT_DPIF_TCPS_ESTABLISHED; >> + } else if (dst->state == CT_DPIF_TCPS_CLOSING) { >> + dst->state = CT_DPIF_TCPS_FIN_WAIT_2; >> + } >> + } >> + if (tcp_flags & TCP_RST) { >> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; >> + } >> + >> + if (src->state >= CT_DPIF_TCPS_FIN_WAIT_2 >> + && dst->state >= CT_DPIF_TCPS_FIN_WAIT_2) { >> + update_expiration(conn, now, 30 * 1000); >> + } else if 
(src->state >= CT_DPIF_TCPS_CLOSING >> + && dst->state >= CT_DPIF_TCPS_CLOSING) { >> + update_expiration(conn, now, 45 * 1000); >> + } else if (src->state < CT_DPIF_TCPS_ESTABLISHED >> + || dst->state < CT_DPIF_TCPS_ESTABLISHED) { >> + update_expiration(conn, now, 30 * 1000); >> + } else if (src->state >= CT_DPIF_TCPS_CLOSING >> + || dst->state >= CT_DPIF_TCPS_CLOSING) { >> + update_expiration(conn, now, 15 * 60 * 1000); >> + } else { >> + update_expiration(conn, now, 24 * 60 * 60 * 1000); >> + } > >It would be nice to have defines for those timeout values. You're right, I've added some defines for these. > > >> + } else if ((dst->state < CT_DPIF_TCPS_SYN_SENT >> + || dst->state >= CT_DPIF_TCPS_FIN_WAIT_2 >> + || src->state >= CT_DPIF_TCPS_FIN_WAIT_2) >> + && SEQ_GEQ(src->seqhi + MAXACKWINDOW, end) >> + /* Within a window forward of the originating packet */ >> + && SEQ_GEQ(seq, src->seqlo - MAXACKWINDOW)) { >> + /* Within a window backward of the originating packet */ >> + >> + /* >> + * This currently handles three situations: >> + * 1) Stupid stacks will shotgun SYNs before their peer >> + * replies. >> + * 2) When PF catches an already established stream (the >> + * firewall rebooted, the state table was flushed, routes >> + * changed...) >> + * 3) Packets get funky immediately after the connection >> + * closes (this should catch Solaris spurious ACK|FINs >> + * that web servers like to spew after a close) >> + * >> + * This must be a little more careful than the above code >> + * since packet floods will also be caught here. We don't >> + * update the TTL here to mitigate the damage of a packet >> + * flood and so the same code can handle awkward establishment >> + * and a loosened connection close. >> + * In the establishment case, a correct peer response will >> + * validate the connection, go through the normal state code >> + * and keep updating the state TTL. 
>> + */ >> + >> + /* update max window */ >> + if (src->max_win < win) { >> + src->max_win = win; >> + } >> + /* synchronize sequencing */ >> + if (SEQ_GT(end, src->seqlo)) { >> + src->seqlo = end; >> + } >> + /* slide the window of what the other end can send */ >> + if (SEQ_GEQ(ack + (win << sws), dst->seqhi)) { >> + dst->seqhi = ack + MAX((win << sws), 1); >> + } >> + >> + /* >> + * Cannot set dst->seqhi here since this could be a shotgunned >> + * SYN and not an already established connection. >> + */ >> + >> + if (tcp_flags & TCP_FIN && src->state < CT_DPIF_TCPS_CLOSING) { >> + src->state = CT_DPIF_TCPS_CLOSING; >> + } >> + >> + if (tcp_flags & TCP_RST) { >> + src->state = dst->state = CT_DPIF_TCPS_TIME_WAIT; >> + } >> + } else { >> + return CT_UPDATE_INVALID; >> + } >> + >> + return CT_UPDATE_VALID; >> +} >> + >> +static bool >> +tcp_valid_new(struct dp_packet *pkt) >> +{ >> + struct tcp_header *tcp = dp_packet_l4(pkt); >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); >> + >> + if (tcp_invalid_flags(tcp_flags)) { >> + return false; >> + } >> + >> + /* A syn+ack is not allowed to create a connection. We want to allow >> + * totally new connections (syn) or already established, not partially >> + * open (syn+ack). 
*/ >> + if ((tcp_flags & TCP_SYN) && (tcp_flags & TCP_ACK)) { >> + return false; >> + } >> + >> + return true; >> +} >> + >> +static struct conn * >> +tcp_new_conn(struct dp_packet *pkt, long long now) >> +{ >> + struct conn_tcp* newconn = NULL; >> + struct tcp_header *tcp = dp_packet_l4(pkt); >> + struct tcp_peer *src, *dst; >> + uint16_t tcp_flags = TCP_FLAGS(tcp->tcp_ctl); >> + >> + newconn = xzalloc(sizeof(struct conn_tcp)); >> + >> + src = &newconn->peer[0]; >> + dst = &newconn->peer[1]; >> + >> + src->seqlo = ntohl(get_16aligned_be32(&tcp->tcp_seq)); >> + src->seqhi = src->seqlo + tcp_payload_length(pkt) + 1; >> + >> + if (tcp_flags & TCP_SYN) { >> + src->seqhi++; >> + src->wscale = tcp_get_wscale(tcp); >> + } else { >> + src->wscale = CT_WSCALE_UNKNOWN; >> + dst->wscale = CT_WSCALE_UNKNOWN; >> + } >> + src->max_win = MAX(ntohs(tcp->tcp_winsz), 1); >> + if (src->wscale & CT_WSCALE_MASK) { >> + /* Remove scale factor from initial window */ >> + uint8_t sws = src->wscale & CT_WSCALE_MASK; >> + src->max_win = DIV_ROUND_UP((uint32_t) src->max_win, 1 << sws); >> + } >> + if (tcp_flags & TCP_FIN) { >> + src->seqhi++; >> + } >> + dst->seqhi = 1; >> + dst->max_win = 1; >> + src->state = CT_DPIF_TCPS_SYN_SENT; >> + dst->state = CT_DPIF_TCPS_CLOSED; >> + >> + update_expiration(newconn, now, 30 * 1000); >> + >> + return &newconn->up; >> +} >> + >> +struct ct_l4_proto ct_proto_tcp = { >> + .new_conn = tcp_new_conn, >> + .valid_new = tcp_valid_new, >> + .conn_update = tcp_conn_update, >> +}; >> diff --git a/lib/conntrack.c b/lib/conntrack.c >> new file mode 100644 >> index 0000000..840335b >> --- /dev/null >> +++ b/lib/conntrack.c >> @@ -0,0 +1,851 @@ >> +/* >> + * Copyright (c) 2015, 2016 Nicira, Inc. >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. 
>> + * You may obtain a copy of the License at: >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. >> + */ >> + >> +#include <config.h> >> +#include "conntrack.h" >> + >> +#include <errno.h> >> +#include <sys/types.h> >> +#include <netinet/in.h> >> +#include <netinet/icmp6.h> >> + >> +#include "bitmap.h" >> +#include "conntrack-private.h" >> +#include "dp-packet.h" >> +#include "flow.h" >> +#include "hmap.h" >> +#include "netdev.h" >> +#include "odp-netlink.h" >> +#include "openvswitch/vlog.h" >> +#include "ovs-rcu.h" >> +#include "random.h" >> +#include "timeval.h" >> + >> +VLOG_DEFINE_THIS_MODULE(conntrack); >> + >> +struct conn_lookup_ctx { >> + struct conn_key key; >> + struct conn *conn; >> + uint32_t hash; >> + bool reply; >> + bool related; >> +}; >> + >> +static bool conn_key_extract(struct conntrack *, struct dp_packet *, >> + struct conn_lookup_ctx *, uint16_t zone); >> +static uint32_t conn_key_hash(const struct conn_key *, uint32_t basis); >> +static void conn_key_reverse(struct conn_key *); >> +static void conn_key_lookup(struct conntrack *ct, >> + struct conn_lookup_ctx *ctx, >> + unsigned bucket, >> + long long now); >> +static bool valid_new(struct dp_packet *pkt, struct conn_key *); >> +static struct conn *new_conn(struct dp_packet *pkt, struct conn_key *, >> + long long now); >> +static void delete_conn(struct conn *); >> +static enum ct_update_res conn_update(struct conn *, struct dp_packet*, >> + bool reply, long long now); >> +static bool conn_expired(struct conn *, long long now); >> +static void set_mark(struct dp_packet *, struct conn *, >> + uint32_t val, uint32_t mask); >> +static void 
set_label(struct dp_packet *, struct conn *, >> + const struct ovs_key_ct_labels *val, >> + const struct ovs_key_ct_labels *mask); >> + >> +static struct ct_l4_proto *l4_protos[] = { >> + [IPPROTO_TCP] = &ct_proto_tcp, >> + [IPPROTO_UDP] = &ct_proto_other, >> + [IPPROTO_ICMP] = &ct_proto_other, >> +}; >> + >> +/* Initializes the connection tracker 'ct'. The caller is responbile for >> + * calling 'conntrack_destroy()', when the instance is not needed anymore */ >> +void >> +conntrack_init(struct conntrack *ct) >> +{ >> + unsigned i; >> + >> + for (i = 0; i < CONNTRACK_BUCKETS; i++) { >> + ct_lock_init(&ct->locks[i]); >> + ct_lock_lock(&ct->locks[i]); >> + hmap_init(&ct->connections[i]); >> + ct_lock_unlock(&ct->locks[i]); >> + } >> + ct->hash_basis = random_uint32(); >> + ct->purge_bucket = 0; >> + ct->purge_inner_bucket = 0; >> + ct->purge_inner_offset = 0; >> +} >> + >> +/* Destroys the connection tracker 'ct' and frees all the allocated memory. >> */ >> +void >> +conntrack_destroy(struct conntrack *ct) >> +{ >> + unsigned i; >> + >> + for (i = 0; i < CONNTRACK_BUCKETS; i++) { >> + struct conn *conn, *next; >> + >> + ct_lock_lock(&ct->locks[i]); >> + HMAP_FOR_EACH_SAFE(conn, next, node, &ct->connections[i]) { >> + hmap_remove(&ct->connections[i], &conn->node); >> + delete_conn(conn); >> + } >> + hmap_destroy(&ct->connections[i]); >> + ct_lock_unlock(&ct->locks[i]); >> + ct_lock_destroy(&ct->locks[i]); >> + } >> +} >> +? >> +static unsigned hash_to_bucket(uint32_t hash) >> +{ >> + /* Extracts the most significant bits in hash. The least significant >> bits >> + * are already used internally by the hmap implementation. 
*/ >> + BUILD_ASSERT(CONNTRACK_BUCKETS_SHIFT < 32 && CONNTRACK_BUCKETS_SHIFT >= >> 1); >> + >> + return (hash >> (32 - CONNTRACK_BUCKETS_SHIFT)) % CONNTRACK_BUCKETS; >> +} >> + >> +static void >> +write_ct_md(struct dp_packet *pkt, uint8_t state, uint16_t zone, >> + uint32_t mark, ovs_u128 label) >> +{ >> + pkt->md.ct_state = state | CS_TRACKED; >> + pkt->md.ct_zone = zone; >> + pkt->md.ct_mark = mark; >> + pkt->md.ct_label = label; >> +} >> + >> +static struct conn * >> +conn_not_found(struct conntrack *ct, struct dp_packet *pkt, >> + struct conn_lookup_ctx *ctx, uint8_t *state, bool commit, >> + long long now) >> +{ >> + unsigned bucket = hash_to_bucket(ctx->hash); >> + struct conn *nc = NULL; >> + >> + if (!valid_new(pkt, &ctx->key)) { >> + *state |= CS_INVALID; >> + return nc; >> + } >> + >> + *state |= CS_NEW; >> + >> + if (commit) { >> + nc = new_conn(pkt, &ctx->key, now); >> + >> + memcpy(&nc->rev_key, &ctx->key, sizeof nc->rev_key); >> + >> + conn_key_reverse(&nc->rev_key); >> + hmap_insert(&ct->connections[bucket], &nc->node, ctx->hash); >> + } >> + >> + return nc; >> +} >> + >> +static struct conn * >> +process_one(struct conntrack *ct, struct dp_packet *pkt, >> + struct conn_lookup_ctx *ctx, uint16_t zone, >> + bool commit, long long now) >> +{ >> + unsigned bucket = hash_to_bucket(ctx->hash); >> + struct conn *conn = ctx->conn; >> + uint8_t state = 0; >> + >> + if (conn) { >> + if (ctx->related) { >> + state |= CS_RELATED; >> + if (ctx->reply) { >> + state |= CS_REPLY_DIR; >> + } >> + } else { >> + enum ct_update_res res; >> + >> + res = conn_update(conn, pkt, ctx->reply, now); >> + >> + switch (res) { >> + case CT_UPDATE_VALID: >> + state |= CS_ESTABLISHED; >> + if (ctx->reply) { >> + state |= CS_REPLY_DIR; >> + } >> + break; >> + case CT_UPDATE_INVALID: >> + state |= CS_INVALID; >> + break; >> + case CT_UPDATE_NEW: >> + hmap_remove(&ct->connections[bucket], &conn->node); >> + delete_conn(conn); >> + conn = conn_not_found(ct, pkt, ctx, &state, commit, 
now); >> + break; >> + } >> + } >> + >> + pkt->md.ct_label = conn->label; >> + pkt->md.ct_mark = conn->mark; >> + write_ct_md(pkt, state, zone, conn->mark, conn->label); >> + } else { >> + conn = conn_not_found(ct, pkt, ctx, &state, commit, now); >> + write_ct_md(pkt, state, zone, 0, (ovs_u128) {{0}}); >> + } >> + >> + return conn; >> +} >> + >> +/* Sends a group of 'cnt' packets ('pkts') through the connection tracker >> + * 'ct'. If 'commit' is true, the packets are allowed to create new entries >> + * in the connection tables. 'setmark', if not NULL, should point to a two >> + * elements array containing a value and a mask to set the connection mark. >> + * 'setlabel' behaves similarly for the connection label.*/ >> +int >> +conntrack_execute(struct conntrack *ct, struct dp_packet **pkts, size_t cnt, >> + bool commit, uint16_t zone, const uint32_t *setmark, >> + const struct ovs_key_ct_labels *setlabel, >> + const char *helper) >> +{ >> +#if !defined(__CHECKER__) && !defined(_WIN32) >> + const size_t KEY_ARRAY_SIZE = cnt; >> +#else >> + enum { KEY_ARRAY_SIZE = NETDEV_MAX_BURST }; >> +#endif >> + struct conn_lookup_ctx ctxs[KEY_ARRAY_SIZE]; >> + int8_t bucket_list[CONNTRACK_BUCKETS]; >> + struct { >> + unsigned bucket; >> + unsigned long maps; >> + } arr[KEY_ARRAY_SIZE]; >> + long long now = time_msec(); >> + size_t i = 0; >> + uint8_t arrcnt = 0; >> + >> + BUILD_ASSERT_DECL(sizeof arr[0].maps * CHAR_BIT >= NETDEV_MAX_BURST); >> + >> + if (helper) { >> + static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 5); >> + >> + VLOG_WARN_RL(&rl, "ALG helper \"%s\" not supported", helper); >> + /* Continue without the helper */ >> + } >> + >> + memset(bucket_list, INT8_C(-1), sizeof bucket_list); >> + for (i = 0; i < cnt; i++) { >> + unsigned bucket; >> + >> + if (!conn_key_extract(ct, pkts[i], &ctxs[i], zone)) { >> + write_ct_md(pkts[i], CS_INVALID, zone, 0, (ovs_u128){{0}}); >> + continue; >> + } >> + >> + bucket = hash_to_bucket(ctxs[i].hash); >> + if 
(bucket_list[bucket] == INT8_C(-1)) { >> + bucket_list[bucket] = arrcnt; >> + >> + arr[arrcnt].maps = 0; >> + ULLONG_SET1(arr[arrcnt].maps, i); >> + arr[arrcnt++].bucket = bucket; >> + } else { >> + ULLONG_SET1(arr[bucket_list[bucket]].maps, i); >> + arr[bucket_list[bucket]].maps |= 1UL << i; >> + } >> + } >> + >> + for (i = 0; i < arrcnt; i++) { >> + size_t j; >> + >> + ct_lock_lock(&ct->locks[arr[i].bucket]); >> + >> + ULLONG_FOR_EACH_1(j, arr[i].maps) { >> + struct conn *conn; >> + >> + conn_key_lookup(ct, &ctxs[j], arr[i].bucket, now); >> + >> + conn = process_one(ct, pkts[j], &ctxs[j], zone, commit, now); >> + >> + if (conn && setmark) { >> + set_mark(pkts[j], conn, setmark[0], setmark[1]); >> + } >> + >> + if (conn && setlabel) { >> + set_label(pkts[j], conn, &setlabel[0], &setlabel[1]); >> + } >> + } >> + ct_lock_unlock(&ct->locks[arr[i].bucket]); >> + } >> + >> + return 0; >> +} >> + >> +static void >> +set_mark(struct dp_packet *pkt, struct conn *conn, uint32_t val, uint32_t >> mask) >> +{ >> + pkt->md.ct_mark = val | (pkt->md.ct_mark & ~(mask)); >> + conn->mark = pkt->md.ct_mark; >> +} >> + >> +static void >> +set_label(struct dp_packet *pkt, struct conn *conn, >> + const struct ovs_key_ct_labels *val, >> + const struct ovs_key_ct_labels *mask) >> +{ >> + ovs_u128 v, m; >> + >> + memcpy(&v, val, sizeof v); >> + memcpy(&m, mask, sizeof m); >> + >> + pkt->md.ct_label.u64.lo = v.u64.lo >> + | (pkt->md.ct_label.u64.lo & ~(m.u64.lo)); >> + pkt->md.ct_label.u64.hi = v.u64.hi >> + | (pkt->md.ct_label.u64.hi & ~(m.u64.hi)); >> + conn->label = pkt->md.ct_label; >> +} >> +? 
>> +#define CONNTRACK_PURGE_NUM 256 >> + >> +static void >> +sweep_bucket(struct hmap *bucket, uint32_t *inner_bucket, >> + uint32_t *inner_offset, unsigned *left, long long now) >> +{ >> + while (*left != 0) { >> + struct hmap_node *node; >> + struct conn *conn; >> + >> + node = hmap_at_position(bucket, inner_bucket, inner_offset); >> + >> + if (!node) { >> + hmap_shrink(bucket); >> + break; >> + } >> + >> + INIT_CONTAINER(conn, node, node); >> + if (conn_expired(conn, now)) { >> + hmap_remove(bucket, &conn->node); >> + delete_conn(conn); >> + (*left)--; >> + } >> + } >> +} >> + >> +/* Cleans up old connection entries. Should be called periodically. */ >> +void >> +conntrack_run(struct conntrack *ct) >> +{ >> + unsigned bucket = hash_to_bucket(ct->purge_bucket); >> + uint32_t inner_bucket = ct->purge_inner_bucket, >> + inner_offset = ct->purge_inner_offset; >> + unsigned left = CONNTRACK_PURGE_NUM; >> + long long now = time_msec(); >> + >> + while (bucket < CONNTRACK_BUCKETS) { >> + ct_lock_lock(&ct->locks[bucket]); >> + sweep_bucket(&ct->connections[bucket], >> + &inner_bucket, &inner_offset, >> + &left, now); >> + ct_lock_unlock(&ct->locks[bucket]); >> + >> + if (left == 0) { >> + break; >> + } else { >> + bucket++; >> + } >> + } >> + >> + ct->purge_bucket = bucket; >> + ct->purge_inner_bucket = inner_bucket; >> + ct->purge_inner_offset = inner_offset; >> +} >> +? >> +/* Key extraction */ >> + >> +/* The function stores a pointer to the first byte after the header in >> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is >> + * not interested in the header's tail, meaning that the header has >> + * already been parsed (e.g. by flow_extract): we take this as a hint to >> + * save a few checks. 
*/ >> +static inline bool >> +extract_l3_ipv4(struct conn_key *key, const void *data, size_t size, >> + const char **new_data) >> +{ >> + const struct ip_header *ip = data; >> + >> + if (new_data) { >> + size_t ip_len; >> + >> + if (OVS_UNLIKELY(size < IP_HEADER_LEN)) { >> + return false; >> + } >> + ip_len = IP_IHL(ip->ip_ihl_ver) * 4; >> + >> + if (OVS_UNLIKELY(ip_len < IP_HEADER_LEN)) { >> + return false; >> + } >> + if (OVS_UNLIKELY(size < ip_len)) { >> + return false; >> + } >> + >> + *new_data = (char *) data + ip_len; >> + } >> + >> + if (IP_IS_FRAGMENT(ip->ip_frag_off)) { >> + return false; >> + } >> + >> + key->src.addr.ipv4 = ip->ip_src; >> + key->dst.addr.ipv4 = ip->ip_dst; >> + key->nw_proto = ip->ip_proto; >> + >> + return true; >> +} >> + >> +/* The function stores a pointer to the first byte after the header in >> + * '*new_data', if 'new_data' is not NULL. If it is NULL, the caller is >> + * not interested in the header's tail, meaning that the header has >> + * already been parsed (e.g. by flow_extract): we take this as a hint to >> + * save a few checks. 
*/ >> +static inline bool >> +extract_l3_ipv6(struct conn_key *key, const void *data, size_t size, >> + const char **new_data) >> +{ >> + const struct ovs_16aligned_ip6_hdr *ip6 = data; >> + uint8_t nw_proto = ip6->ip6_nxt; >> + uint8_t nw_frag = 0; >> + >> + if (new_data) { >> + if (OVS_UNLIKELY(size < sizeof *ip6)) { >> + return false; >> + } >> + } >> + >> + data = ip6 + 1; >> + size -= sizeof *ip6; >> + >> + if (!parse_ipv6_ext_hdrs(&data, &size, &nw_proto, &nw_frag)) { >> + return false; >> + } >> + >> + if (new_data) { >> + *new_data = data; >> + } >> + >> + if (nw_frag) { >> + return false; >> + } >> + >> + key->src.addr.ipv6 = ip6->ip6_src; >> + key->dst.addr.ipv6 = ip6->ip6_dst; >> + key->nw_proto = nw_proto; >> + >> + return true; >> +} >> + >> +static inline bool >> +check_l4_tcp(const void *data, size_t size) >> +{ >> + const struct tcp_header *tcp = data; >> + size_t tcp_len = TCP_OFFSET(tcp->tcp_ctl) * 4; >> + >> + if (OVS_LIKELY(tcp_len >= TCP_HEADER_LEN && tcp_len <= size)) { >> + return true; >> + } >> + >> + return false; >> +} >> + >> +static inline bool >> +extract_l4_tcp(struct conn_key *key, const void *data, size_t size) >> +{ >> + const struct tcp_header *tcp = data; >> + >> + if (OVS_UNLIKELY(size < TCP_HEADER_LEN)) { >> + return false; >> + } >> + >> + key->src.port = tcp->tcp_src; >> + key->dst.port = tcp->tcp_dst; >> + >> + return true; >> +} >> + >> +static inline bool >> +extract_l4_udp(struct conn_key *key, const void *data, size_t size) >> +{ >> + const struct udp_header *udp = data; >> + >> + if (OVS_UNLIKELY(size < UDP_HEADER_LEN)) { >> + return false; >> + } >> + >> + key->src.port = udp->udp_src; >> + key->dst.port = udp->udp_dst; >> + >> + return true; >> +} >> + >> +static inline bool extract_l4(struct conn_key *key, const void *data, >> + size_t size, bool *related); >> + >> +/* If 'related' is not NULL and the function is processing an ICMP >> + * error packet, extract the l3 and l4 fields from the nested header >> + * 
instead and set *related to true. If 'related' is NULL we're >> + * already processing a nested header and no such recursion is >> + * possible */ >> +static inline int >> +extract_l4_icmp(struct conn_key *key, const void *data, size_t size, >> + bool *related) >> +{ >> + const struct icmp_header *icmp = data; >> + >> + if (OVS_UNLIKELY(size < ICMP_HEADER_LEN)) { >> + return false; >> + } >> + >> + switch (icmp->icmp_type) { >> + case ICMP4_ECHO_REQUEST: >> + case ICMP4_ECHO_REPLY: >> + case ICMP4_TIMESTAMP: >> + case ICMP4_TIMESTAMPREPLY: >> + case ICMP4_INFOREQUEST: >> + case ICMP4_INFOREPLY: >> + /* Separate ICMP connection: identified using id */ >> + key->src.port = key->dst.port = icmp->icmp_fields.echo.id; >> + break; >> + case ICMP4_DST_UNREACH: >> + case ICMP4_TIME_EXCEEDED: >> + case ICMP4_PARAM_PROB: >> + case ICMP4_SOURCEQUENCH: >> + case ICMP4_REDIRECT: { >> + /* ICMP packet part of another connection. We should >> + * extract the key from embedded packet header */ >> + struct conn_key inner_key; >> + const char *l3 = (const char *) (icmp + 1); >> + const char *tail = (const char *) data + size; >> + const char *l4; >> + bool ok; >> + >> + if (!related) { >> + return false; >> + } >> + *related = true; >> + >> + memset(&inner_key, 0, sizeof inner_key); >> + inner_key.dl_type = htons(ETH_TYPE_IP); >> + ok = extract_l3_ipv4(&inner_key, l3, tail - l3, &l4); >> + if (!ok) { >> + return false; >> + } >> + >> + /* pf doesn't do this, but it seems a good idea */ >> + if (inner_key.src.addr.ipv4_aligned != key->dst.addr.ipv4_aligned >> + || inner_key.dst.addr.ipv4_aligned != >> key->src.addr.ipv4_aligned) { >> + return false; >> + } >> + >> + key->src = inner_key.src; >> + key->dst = inner_key.dst; >> + key->nw_proto = inner_key.nw_proto; >> + >> + ok = extract_l4(key, l4, tail - l4, NULL); >> + if (ok) { >> + conn_key_reverse(key); >> + } >> + return ok; >> + } >> + default: >> + return false; >> + } >> + >> + return true; >> +} >> + >> +/* If 'related' is 
not NULL and the function is processing an ICMP >> + * error packet, extract the l3 and l4 fields from the nested header >> + * instead and set *related to true. If 'related' is NULL we're >> + * already processing a nested header and no such recursion is >> + * possible */ >> +static inline bool >> +extract_l4_icmp6(struct conn_key *key, const void *data, size_t size, >> + bool *related) >> +{ >> + const struct icmp6_header *icmp6 = data; >> + >> + /* All the messages that we support need at least 4 bytes after >> + * the header */ >> + if (size < sizeof *icmp6 + 4) { >> + return false; >> + } >> + >> + switch (icmp6->icmp6_type) { >> + case ICMP6_ECHO_REQUEST: >> + case ICMP6_ECHO_REPLY: >> + /* Separate ICMP connection: identified using id */ >> + key->src.port = key->dst.port = *(ovs_be16 *) (icmp6 + 1); >> + break; >> + case ICMP6_DST_UNREACH: >> + case ICMP6_PACKET_TOO_BIG: >> + case ICMP6_TIME_EXCEEDED: >> + case ICMP6_PARAM_PROB:{ >> + /* ICMP packet part of another connection. We should >> + * extract the key from embedded packet header */ >> + struct conn_key inner_key; >> + const char *l3 = (const char *) icmp6 + 8; >> + const char *tail = (const char *) data + size; >> + const char *l4 = NULL; >> + bool ok; >> + >> + if (!related) { >> + return false; >> + } >> + *related = true; >> + >> + memset(&inner_key, 0, sizeof inner_key); >> + inner_key.dl_type = htons(ETH_TYPE_IPV6); >> + ok = extract_l3_ipv6(&inner_key, l3, tail - l3, &l4); >> + if (!ok) { >> + return false; >> + } >> + >> + /* pf doesn't do this, but it seems a good idea */ >> + if (!ipv6_addr_equals(&inner_key.src.addr.ipv6_aligned, >> + &key->dst.addr.ipv6_aligned) >> + || !ipv6_addr_equals(&inner_key.dst.addr.ipv6_aligned, >> + &key->src.addr.ipv6_aligned)) { >> + return false; >> + } >> + >> + key->src = inner_key.src; >> + key->dst = inner_key.dst; >> + key->nw_proto = inner_key.nw_proto; >> + >> + ok = extract_l4(key, l4, tail - l4, NULL); >> + if (ok) { >> + conn_key_reverse(key); >> + 
} >> + return ok; >> + } >> + default: >> + return false; >> + } >> + >> + return true; >> +} >> + >> +/* Extract l4 fields into 'key', which must already contain valid l3 >> + * members. If 'related' is not NULL and an ICMP error packet is being >> + * processed, the function will extract the key from the packet nested >> + * in the ICMP payload and set '*related' to true. If 'related' is NULL, >> + * nested parsing isn't allowed. This is necessary to limit the >> + * recursion level. */ >> +static inline bool >> +extract_l4(struct conn_key *key, const void *data, size_t size, bool >> *related) >> +{ >> + if (key->nw_proto == IPPROTO_TCP) { >> + return extract_l4_tcp(key, data, size) >> + && (!related || check_l4_tcp(data, size)); >> + } else if (key->nw_proto == IPPROTO_UDP) { >> + return extract_l4_udp(key, data, size); >> + } else if (key->dl_type == htons(ETH_TYPE_IP) >> + && key->nw_proto == IPPROTO_ICMP) { >> + return extract_l4_icmp(key, data, size, related); >> + } else if (key->dl_type == htons(ETH_TYPE_IPV6) >> + && key->nw_proto == IPPROTO_ICMPV6) { >> + return extract_l4_icmp6(key, data, size, related); >> + } else { >> + return false; >> + } >> +} >> + >> +static bool >> +conn_key_extract(struct conntrack *ct, struct dp_packet *pkt, >> + struct conn_lookup_ctx *ctx, uint16_t zone) >> +{ >> + const struct eth_header *l2 = dp_packet_l2(pkt); >> + const struct ip_header *l3 = dp_packet_l3(pkt); >> + const char *l4 = dp_packet_l4(pkt); >> + const char *tail = dp_packet_tail(pkt); >> + bool ok; >> + >> + memset(ctx, 0, sizeof *ctx); >> + >> + if (!l2 || !l3 || !l4) { >> + return false; >> + } >> + >> + ctx->key.zone = zone; >> + >> + /* XXX In this function we parse the packet (again, it has already >> + * gone through miniflow_extract()) for two reasons: >> + * >> + * 1) To extract the l3 addresses and l4 ports. >> + * We already have the l3 and l4 headers' pointers.
Extracting >> + * the l3 addresses and the l4 ports is really cheap, since they >> + * can be found at fixed locations. >> + * 2) To extract the l3 and l4 types. >> + * Extracting the l3 and l4 types (especially the l3[1]) on the >> + * other hand is quite expensive, because they're not at a >> + * fixed location. >> + * >> + * Here's a way to avoid (2) with the help of the datapath. >> + * The datapath doesn't keep the packet's extracted flow[2], so >> + * using that is not an option. We could use the packet's matching >> + * megaflow, but we have to make sure that the l3 and l4 types >> + * are unwildcarded. This means either: >> + * >> + * a) dpif-netdev unwildcards the l3 (and l4) types when a new flow >> + * is installed if the actions contain ct(). This is what the >> + * kernel datapath does. It is not so straightforward, though. >> + * >> + * b) ofproto-dpif-xlate unwildcards the l3 (and l4) types when >> + * translating a ct() action. This is already done in different >> + * actions and since both the userspace and the kernel datapath >> + * would benefit from it, it seems an appropriate place to do >> + * it. >> + * >> + * --- >> + * [1] A simple benchmark (running only the connection tracker >> + * over and over on the same packets) shows that if the >> + * l3 type is already provided we are 15% faster (running the >> + * connection tracker over a couple of DPDK devices with a >> + * stream of 64-byte UDP packets shows that we are 4% faster). >> + * >> + * [2] The reasons for this are that keeping the flow increases >> + * (slightly) the cache footprint and increases computation >> + * time as we move the packet around. Most importantly, the flow >> + * should be updated by the actions and this can be slow, as >> + * we use a sparse representation (miniflow).
>> + * >> + */ >> + ctx->key.dl_type = parse_dl_type(l2, (char *) l3 - (char *) l2); >> + if (ctx->key.dl_type == htons(ETH_TYPE_IP)) { >> + ok = extract_l3_ipv4(&ctx->key, l3, tail - (char *) l3, NULL); >> + } else if (ctx->key.dl_type == htons(ETH_TYPE_IPV6)) { >> + ok = extract_l3_ipv6(&ctx->key, l3, tail - (char *) l3, NULL); >> + } else { >> + ok = false; >> + } >> + >> + if (ok) { >> + if (extract_l4(&ctx->key, l4, tail - l4, &ctx->related)) { >> + ctx->hash = conn_key_hash(&ctx->key, ct->hash_basis); >> + return true; >> + } >> + } >> + >> + return false; >> +} >> +? >> +/* Symmetric */ >> +static uint32_t >> +conn_key_hash(const struct conn_key *key, uint32_t basis) >> +{ >> + uint32_t hsrc, hdst, hash; >> + int i; >> + >> + hsrc = hdst = basis; >> + >> + for (i = 0; i < sizeof(key->src) / sizeof(uint32_t); i++) { >> + hsrc = hash_add(hsrc, ((uint32_t *) &key->src)[i]); >> + hdst = hash_add(hdst, ((uint32_t *) &key->dst)[i]); >> + } >> + >> + hash = hsrc ^ hdst; >> + >> + hash = hash_words((uint32_t *) &key->dst + 1, >> + (uint32_t *) (key + 1) - (uint32_t *) (&key->dst + 1), >> + hash); >> + >> + return hash; >> +} >> + >> +static void >> +conn_key_reverse(struct conn_key *key) >> +{ >> + struct ct_endpoint tmp; >> + tmp = key->src; >> + key->src = key->dst; >> + key->dst = tmp; >> +} >> + >> +static void >> +conn_key_lookup(struct conntrack *ct, >> + struct conn_lookup_ctx *ctx, >> + unsigned bucket, >> + long long now) >> +{ >> + struct conn *conn, *found = NULL; >> + uint32_t hash = ctx->hash; >> + bool reply; >> + >> + HMAP_FOR_EACH_WITH_HASH (conn, node, hash, &ct->connections[bucket]) { >> + if (!memcmp(&conn->key, &ctx->key, sizeof(conn->key))) { >> + found = conn; >> + reply = false; >> + break; >> + } >> + if (!memcmp(&conn->rev_key, &ctx->key, sizeof(conn->rev_key))) { >> + found = conn; >> + reply = true; >> + break; >> + } >> + } >> + >> + if (found) { >> + if (conn_expired(found, now)) { >> + found = NULL; >> + } else { >> + ctx->reply = 
reply; >> + } >> + } >> + >> + ctx->conn = found; >> +} >> + >> +static enum ct_update_res >> +conn_update(struct conn *conn, struct dp_packet *pkt, bool reply, >> + long long now) >> +{ >> + return l4_protos[conn->key.nw_proto]->conn_update(conn, pkt, reply, >> now); >> +} >> + >> +static bool >> +conn_expired(struct conn *conn, long long now) >> +{ >> + return now >= conn->expiration; >> +} >> + >> +static bool >> +valid_new(struct dp_packet *pkt, struct conn_key *key) >> +{ >> + return l4_protos[key->nw_proto]->valid_new(pkt); >> +} >> + >> +static struct conn * >> +new_conn(struct dp_packet *pkt, struct conn_key *key, long long now) >> +{ >> + struct conn *newconn; >> + >> + newconn = l4_protos[key->nw_proto]->new_conn(pkt, now); >> + >> + if (newconn) { >> + newconn->key = *key; >> + } >> + >> + return newconn; >> +} >> + >> +static void >> +delete_conn(struct conn *conn) >> +{ >> + free(conn); >> +} >> diff --git a/lib/conntrack.h b/lib/conntrack.h >> new file mode 100644 >> index 0000000..8561273 >> --- /dev/null >> +++ b/lib/conntrack.h >> @@ -0,0 +1,144 @@ >> +/* >> + * Copyright (c) 2015, 2016 Nicira, Inc. >> + * >> + * Licensed under the Apache License, Version 2.0 (the "License"); >> + * you may not use this file except in compliance with the License. >> + * You may obtain a copy of the License at: >> + * >> + * http://www.apache.org/licenses/LICENSE-2.0 >> + * >> + * Unless required by applicable law or agreed to in writing, software >> + * distributed under the License is distributed on an "AS IS" BASIS, >> + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. >> + * See the License for the specific language governing permissions and >> + * limitations under the License. 
>> + */ >> + >> +#ifndef CONNTRACK_H >> +#define CONNTRACK_H 1 >> + >> +#include <stdbool.h> >> + >> +#include "hmap.h" >> +#include "netdev-dpdk.h" >> +#include "odp-netlink.h" >> +#include "openvswitch/thread.h" >> +#include "openvswitch/types.h" >> + >> + >> +struct dp_packet; >> + >> +/* Userspace connection tracker >> + * ============================ >> + * >> + * This is a connection tracking module that keeps all the state in >> userspace. >> + * >> + * Usage >> + * ===== >> + * >> + * struct conntrack ct; >> + * >> + * Initialization: >> + * >> + * conntrack_init(&ct); >> + * >> + * It is necessary to periodically issue a call to >> + * >> + * conntrack_run(&ct); >> + * >> + * to allow the module to clean up expired connections. >> + * >> + * To send a group of packets through the connection tracker: >> + * >> + * conntrack_execute(&ct, pkts, n_pkts, ...); >> + * >> + * Thread-safety >> + * ============= >> + * >> + * conntrack_execute() can be called by multiple threads simultaneously. >> + */ >> + >> +struct conntrack; >> + >> +void conntrack_init(struct conntrack *); >> +void conntrack_run(struct conntrack *); >> +void conntrack_destroy(struct conntrack *); >> + >> +int conntrack_execute(struct conntrack *, struct dp_packet **, size_t, >> + bool commit, uint16_t zone, const uint32_t *setmark, >> + const struct ovs_key_ct_labels *setlabel, >> + const char *helper); >> +? >> +/* struct ct_lock is a standard mutex or a spinlock when using DPDK */ >> + >> +#ifdef DPDK_NETDEV >> +struct OVS_LOCKABLE ct_lock { >> + rte_spinlock_t lock; >> +}; >> + >> +static inline void ct_lock_init(struct ct_lock *lock) >> +{ >> + rte_spinlock_init(&lock->lock); >> +} >> + >> +static inline void ct_lock_lock(struct ct_lock *lock) > >There is an extra space above at the end.
Removed, thanks > >> + OVS_ACQUIRES(lock) >> + OVS_NO_THREAD_SAFETY_ANALYSIS >> +{ >> + rte_spinlock_lock(&lock->lock); >> +} >> + >> +static inline void ct_lock_unlock(struct ct_lock *lock) >> + OVS_RELEASES(lock) >> + OVS_NO_THREAD_SAFETY_ANALYSIS >> +{ >> + rte_spinlock_unlock(&lock->lock); >> +} >> + >> +static inline void ct_lock_destroy(struct ct_lock *lock OVS_UNUSED) >> +{ >> +} >> +#else >> +struct OVS_LOCKABLE ct_lock { >> + struct ovs_mutex lock; >> +}; >> + >> +static inline void ct_lock_init(struct ct_lock *lock) >> +{ >> + ovs_mutex_init(&lock->lock); >> +} >> + >> +static inline void ct_lock_lock(struct ct_lock *lock) >> + OVS_ACQUIRES(lock) >> + OVS_NO_THREAD_SAFETY_ANALYSIS >> +{ >> + ovs_mutex_lock(&lock->lock); >> +} >> + >> +static inline void ct_lock_unlock(struct ct_lock *lock) >> + OVS_RELEASES(lock) >> + OVS_NO_THREAD_SAFETY_ANALYSIS >> +{ >> + ovs_mutex_unlock(&lock->lock); >> +} >> + >> +static inline void ct_lock_destroy(struct ct_lock *lock) >> +{ >> + ovs_mutex_destroy(&lock->lock); >> +} >> +#endif >> +? >> +#define CONNTRACK_BUCKETS_SHIFT 8 >> +#define CONNTRACK_BUCKETS (1 << CONNTRACK_BUCKETS_SHIFT) >> + >> +struct conntrack { >> + /* Each lock guards a 'connections' bucket */ >> + struct ct_lock locks[CONNTRACK_BUCKETS]; >> + struct hmap connections[CONNTRACK_BUCKETS] OVS_GUARDED; >> + uint32_t hash_basis; >> + unsigned purge_bucket; >> + uint32_t purge_inner_bucket; >> + uint32_t purge_inner_offset; >> +}; >> + >> +#endif /* conntrack.h */ >> -- >> 2.1.4 >> >> _______________________________________________ >> dev mailing list >> dev@openvswitch.org >> http://openvswitch.org/mailman/listinfo/dev > >-- >fbl _______________________________________________ dev mailing list dev@openvswitch.org http://openvswitch.org/mailman/listinfo/dev