Author: kmacy
Date: Fri Mar 12 05:03:26 2010
New Revision: 205066
URL: http://svn.freebsd.org/changeset/base/205066

Log:
  - restructure flowtable to support ipv6
  - add a name argument to flowtable_alloc for printing with ddb commands
  - extend ddb commands to print destination address or 4-tuples
  - don't parse ports in ulp header if FL_HASH_ALL is not passed
  - add kern_flowtable_insert to enable more generic use of flowtable
    (e.g. system calls for adding entries)
  - don't hash loopback addresses
  - clean up whitespace
  - keep statistics per-cpu for per-cpu flowtables to avoid cache line contention
  - add sysctls to accumulate stats and report aggregate
  
  MFC after:    7 days

Modified:
  head/sys/net/flowtable.c
  head/sys/net/flowtable.h
  head/sys/net/if_llatbl.c
  head/sys/net/if_llatbl.h
  head/sys/netinet/ip_input.c
  head/sys/netinet/ip_output.c

Modified: head/sys/net/flowtable.c
==============================================================================
--- head/sys/net/flowtable.c    Fri Mar 12 04:44:20 2010        (r205065)
+++ head/sys/net/flowtable.c    Fri Mar 12 05:03:26 2010        (r205066)
@@ -1,6 +1,6 @@
 /**************************************************************************
 
-Copyright (c) 2008-2009, BitGravity Inc.
+Copyright (c) 2008-2010, BitGravity Inc.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -30,6 +30,8 @@ POSSIBILITY OF SUCH DAMAGE.
 #include "opt_route.h"
 #include "opt_mpath.h"
 #include "opt_ddb.h"
+#include "opt_inet.h"
+#include "opt_inet6.h"
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
@@ -45,6 +47,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/proc.h>
+#include <sys/sbuf.h>
 #include <sys/sched.h>
 #include <sys/smp.h>
 #include <sys/socket.h>
@@ -63,6 +66,9 @@ __FBSDID("$FreeBSD$");
 #include <netinet/in_var.h>
 #include <netinet/if_ether.h>
 #include <netinet/ip.h>
+#ifdef INET6
+#include <netinet/ip6.h>
+#endif
 #include <netinet/tcp.h>
 #include <netinet/udp.h>
 #include <netinet/sctp.h>
@@ -140,31 +146,39 @@ union flentryp {
        struct flentry          **pcpu[MAXCPU];
 };
 
+/*
+ * Flow table counters.  struct flowtable embeds MAXCPU of these
+ * (ft_stats[]); flowtable_insert() and flowtable_lookup() update the
+ * slot indexed by curcpu.  Each slot is aligned to 128 bytes; per the
+ * commit log the per-cpu split is meant to avoid cache line contention.
+ */
+struct flowtable_stats {
+       uint64_t        ft_collisions;
+       uint64_t        ft_allocated;
+       uint64_t        ft_misses;
+       uint64_t        ft_max_depth;
+       uint64_t        ft_free_checks;
+       uint64_t        ft_frees;
+       uint64_t        ft_hits;
+       uint64_t        ft_lookups;
+} __aligned(128);
+
 struct flowtable {
+       struct  flowtable_stats ft_stats[MAXCPU];
        int             ft_size;
        int             ft_lock_count;
        uint32_t        ft_flags;
-       uint32_t        ft_collisions;
-       uint32_t        ft_allocated;
-       uint32_t        ft_misses;
-       uint64_t        ft_hits;
 
        uint32_t        ft_udp_idle;
        uint32_t        ft_fin_wait_idle;
        uint32_t        ft_syn_idle;
        uint32_t        ft_tcp_idle;
 
+       char            *ft_name;
        fl_lock_t       *ft_lock;
        fl_lock_t       *ft_unlock;
        fl_rtalloc_t    *ft_rtalloc;
        struct mtx      *ft_locks;
 
-       
        union flentryp  ft_table;
        bitstr_t        *ft_masks[MAXCPU];
        bitstr_t        *ft_tmpmask;
        struct flowtable *ft_next;
-};
+} __aligned(128);
 
 static struct proc *flowcleanerproc;
 static VNET_DEFINE(struct flowtable *, flow_list_head);
@@ -181,12 +195,24 @@ static struct cv  flowclean_cv;
 static struct mtx      flowclean_lock;
 static uint32_t                flowclean_cycles;
 
+/*
+ * FLDPRINTF(ft, flags, fmt, ...): debug printf.  With FLOWTABLE_DEBUG
+ * defined it prints only when one of 'flags' is set in ft->ft_flags;
+ * otherwise it expands to nothing.
+ *
+ * The debug expansion is a single do/while(0) statement with no
+ * trailing semicolon, so the caller's own ';' terminates it and the
+ * macro can be used as the body of an unbraced if/else.
+ */
+#ifdef FLOWTABLE_DEBUG
+#define FLDPRINTF(ft, flags, fmt, ...)                 \
+do {                                                   \
+       if ((ft)->ft_flags & (flags))                   \
+               printf((fmt), __VA_ARGS__);             \
+} while (0)
+
+#else
+#define FLDPRINTF(ft, flags, fmt, ...)
+
+#endif
+
+
 /*
  * TODO:
  * - Make flowtable stats per-cpu, aggregated at sysctl call time,
  *   to avoid extra cache evictions caused by incrementing a shared
  *   counter
- * - add IPv6 support to flow lookup
  * - add sysctls to resize && flush flow tables 
  * - Add per flowtable sysctls for statistics and configuring timeouts
  * - add saturation counter to rtentry to support per-packet load-balancing
@@ -200,13 +226,6 @@ static uint32_t            flowclean_cycles;
  */
 VNET_DEFINE(int, flowtable_enable) = 1;
 static VNET_DEFINE(int, flowtable_debug);
-static VNET_DEFINE(int, flowtable_hits);
-static VNET_DEFINE(int, flowtable_lookups);
-static VNET_DEFINE(int, flowtable_misses);
-static VNET_DEFINE(int, flowtable_frees);
-static VNET_DEFINE(int, flowtable_free_checks);
-static VNET_DEFINE(int, flowtable_max_depth);
-static VNET_DEFINE(int, flowtable_collisions);
 static VNET_DEFINE(int, flowtable_syn_expire) = SYN_IDLE;
 static VNET_DEFINE(int, flowtable_udp_expire) = UDP_IDLE;
 static VNET_DEFINE(int, flowtable_fin_wait_expire) = FIN_WAIT_IDLE;
@@ -216,13 +235,6 @@ static VNET_DEFINE(int, flowtable_ready)
 
 #define        V_flowtable_enable              VNET(flowtable_enable)
 #define        V_flowtable_debug               VNET(flowtable_debug)
-#define        V_flowtable_hits                VNET(flowtable_hits)
-#define        V_flowtable_lookups             VNET(flowtable_lookups)
-#define        V_flowtable_misses              VNET(flowtable_misses)
-#define        V_flowtable_frees               VNET(flowtable_frees)
-#define        V_flowtable_free_checks         VNET(flowtable_free_checks)
-#define        V_flowtable_max_depth           VNET(flowtable_max_depth)
-#define        V_flowtable_collisions          VNET(flowtable_collisions)
 #define        V_flowtable_syn_expire          VNET(flowtable_syn_expire)
 #define        V_flowtable_udp_expire          VNET(flowtable_udp_expire)
 #define        V_flowtable_fin_wait_expire     VNET(flowtable_fin_wait_expire)
@@ -235,20 +247,6 @@ SYSCTL_VNET_INT(_net_inet_flowtable, OID
     &VNET_NAME(flowtable_debug), 0, "print debug info.");
 SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, enable, CTLFLAG_RW,
     &VNET_NAME(flowtable_enable), 0, "enable flowtable caching.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, hits, CTLFLAG_RD,
-    &VNET_NAME(flowtable_hits), 0, "# flowtable hits.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, lookups, CTLFLAG_RD,
-    &VNET_NAME(flowtable_lookups), 0, "# flowtable lookups.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, misses, CTLFLAG_RD,
-    &VNET_NAME(flowtable_misses), 0, "#flowtable misses.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, frees, CTLFLAG_RD,
-    &VNET_NAME(flowtable_frees), 0, "#flows freed.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, free_checks, CTLFLAG_RD,
-    &VNET_NAME(flowtable_free_checks), 0, "#flows free checks.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, max_depth, CTLFLAG_RD,
-    &VNET_NAME(flowtable_max_depth), 0, "max collision list length.");
-SYSCTL_VNET_INT(_net_inet_flowtable, OID_AUTO, collisions, CTLFLAG_RD,
-    &VNET_NAME(flowtable_collisions), 0, "#flowtable collisions.");
 
 /*
  * XXX This does not end up updating timeouts at runtime
@@ -298,6 +296,77 @@ SYSCTL_VNET_PROC(_net_inet_flowtable, OI
     CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_nmbflows, "IU",
     "Maximum number of flows allowed");
 
+
+
+/*
+ * Append "\t<field>=<value>" to sb for one counter of the
+ * flowtable_stats pointed to by the local variable 'fs'.  The counters
+ * are uint64_t, so they are widened to uintmax_t to match %ju.
+ */
+#define FS_PRINT(sb, field)                                            \
+       sbuf_printf((sb), "\t%s=%ju", #field, (uintmax_t)fs->ft_##field)
+
+/*
+ * Format every counter of *fs into a temporary sbuf and print the
+ * resulting line with printf().
+ */
+static void
+fs_print(struct flowtable_stats *fs)
+{
+       struct sbuf *sb;
+
+       sb = sbuf_new(NULL, NULL, 32*1024, SBUF_FIXEDLEN);
+       if (sb == NULL)
+               return;
+
+       FS_PRINT(sb, collisions);
+       FS_PRINT(sb, allocated);
+       FS_PRINT(sb, misses);
+       FS_PRINT(sb, max_depth);
+       FS_PRINT(sb, free_checks);
+       FS_PRINT(sb, frees);
+       FS_PRINT(sb, hits);
+       FS_PRINT(sb, lookups);
+       sbuf_finish(sb);
+
+       /* The sbuf is only a staging buffer: emit it, then release it. */
+       printf("%s\n", sbuf_data(sb));
+       sbuf_delete(sb);
+}
+
+/*
+ * Print the counters of one flowtable.
+ *
+ * flowtable_insert() and flowtable_lookup() update ft_stats[curcpu]
+ * whether or not the table is FL_PCPU, so the per-CPU slots are always
+ * folded together here: additive counters are summed and ft_max_depth
+ * is the largest value recorded on any present CPU.
+ */
+static void
+flowtable_show_stats(struct flowtable *ft)
+{
+       struct flowtable_stats total, *cur;
+       int cpu;
+
+       bzero(&total, sizeof(total));
+       for (cpu = 0; cpu <= mp_maxid; cpu++) {
+               if (CPU_ABSENT(cpu))
+                       continue;
+               cur = &ft->ft_stats[cpu];
+               total.ft_collisions  += cur->ft_collisions;
+               total.ft_allocated   += cur->ft_allocated;
+               total.ft_misses      += cur->ft_misses;
+               total.ft_free_checks += cur->ft_free_checks;
+               total.ft_frees       += cur->ft_frees;
+               total.ft_hits        += cur->ft_hits;
+               total.ft_lookups     += cur->ft_lookups;
+               /* Depth is a high-water mark, not a sum. */
+               if (cur->ft_max_depth > total.ft_max_depth)
+                       total.ft_max_depth = cur->ft_max_depth;
+       }
+
+       fs_print(&total);
+}
+
+/*
+ * Sysctl handler: walk the list starting at V_flow_list_head and, for
+ * each flowtable, print its name followed by its counters.  Output goes
+ * through printf(); nothing is copied out via the sysctl request and
+ * the handler always returns 0.
+ */
+static int
+sysctl_flowtable_stats(SYSCTL_HANDLER_ARGS)
+{
+       struct flowtable *cur;
+
+       for (cur = V_flow_list_head; cur != NULL; cur = cur->ft_next) {
+               printf("name: %s\n", cur->ft_name);
+               flowtable_show_stats(cur);
+       }
+
+       return (0);
+}
+SYSCTL_VNET_PROC(_net_inet_flowtable, OID_AUTO, stats,
+    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_flowtable_stats, "IU",
+    "flowtable statistics");
+
+
 #ifndef RADIX_MPATH
 static void
 in_rtalloc_ign_wrapper(struct route *ro, uint32_t hash, u_int fibnum)
@@ -342,52 +411,122 @@ flowtable_pcpu_unlock(struct flowtable *
 #define FL_ENTRY_LOCK(table, hash)  (table)->ft_lock((table), (hash))
 #define FL_ENTRY_UNLOCK(table, hash) (table)->ft_unlock((table), (hash))
 
-#define FL_STALE (1<<8)
-#define FL_IPV6  (1<<9)
+#define FL_STALE       (1<<8)
+#define FL_IPV6        (1<<9)
+#define FL_OVERWRITE   (1<<10)
 
-static uint32_t
-ipv4_flow_lookup_hash_internal(struct mbuf *m, struct route *ro,
-    uint32_t *key, uint16_t *flags, uint8_t *protop)
+void
+flow_invalidate(struct flentry *fle)
 {
-       uint16_t sport = 0, dport = 0;
-       struct ip *ip = NULL;
-       uint8_t proto = 0;
+
+       fle->f_flags |= FL_STALE;
+}
+
+/*
+ * Translate an IP protocol number into the matching FL_* protocol
+ * flag.  Anything other than TCP, SCTP or UDP yields 0.
+ */
+static __inline int
+proto_to_flags(uint8_t proto)
+{
+
+       switch (proto) {
+       case IPPROTO_TCP:
+               return (FL_TCP);
+       case IPPROTO_SCTP:
+               return (FL_SCTP);
+       case IPPROTO_UDP:
+               return (FL_UDP);
+       default:
+               return (0);
+       }
+}
+
+/*
+ * Inverse of proto_to_flags(): recover the IP protocol number from the
+ * FL_TCP/FL_SCTP/FL_UDP bits of 'flags'.  Returns 0 unless exactly one
+ * of those flag values is present.
+ */
+static __inline int
+flags_to_proto(int flags)
+{
+       int which;
+
+       which = flags & (FL_TCP|FL_SCTP|FL_UDP);
+       if (which == FL_TCP)
+               return (IPPROTO_TCP);
+       if (which == FL_SCTP)
+               return (IPPROTO_SCTP);
+       if (which == FL_UDP)
+               return (IPPROTO_UDP);
+       return (0);
+}
+
+#ifdef INET
+#ifdef FLOWTABLE_DEBUG
+/*
+ * Debug helper: print "proto=<n> src:port->dst:port" when FL_HASH_ALL
+ * is set in 'flags', otherwise just "proto=<n> dst".
+ */
+static void
+ipv4_flow_print_tuple(int flags, int proto, struct sockaddr_in *ssin,
+    struct sockaddr_in *dsin)
+{
+       char srcbuf[4*sizeof "123"], dstbuf[4*sizeof "123"];
+
+       /* The destination is printed in both formats. */
+       inet_ntoa_r(dsin->sin_addr, dstbuf);
+       if ((flags & FL_HASH_ALL) == 0) {
+               printf("proto=%d %s\n", proto, dstbuf);
+               return;
+       }
+
+       inet_ntoa_r(ssin->sin_addr, srcbuf);
+       printf("proto=%d %s:%d->%s:%d\n",
+           proto, srcbuf, ntohs(ssin->sin_port), dstbuf,
+           ntohs(dsin->sin_port));
+}
+#endif
+
+static int
+ipv4_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
+    struct sockaddr_in *ssin, struct sockaddr_in *dsin, uint16_t *flags)
+{
+       struct ip *ip;
+       uint8_t proto;
        int iphlen;
-       uint32_t hash;
-       struct sockaddr_in *sin;
        struct tcphdr *th;
        struct udphdr *uh;
        struct sctphdr *sh;
+       uint16_t sport, dport;
 
-       if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
-               return (0);
+       proto = sport = dport = 0;
+       ip = mtod(m, struct ip *);
+       dsin->sin_family = AF_INET;
+       dsin->sin_len = sizeof(*dsin);
+       dsin->sin_addr = ip->ip_dst;
+       ssin->sin_family = AF_INET;
+       ssin->sin_len = sizeof(*ssin);
+       ssin->sin_addr = ip->ip_src;    
 
-       key[1] = key[0] = 0;
-       sin = (struct sockaddr_in *)&ro->ro_dst;
-       if (m != NULL) {
-               ip = mtod(m, struct ip *);
-               sin->sin_family = AF_INET;
-               sin->sin_len = sizeof(*sin);
-               sin->sin_addr = ip->ip_dst;
-       } else
-               *flags &= ~FL_HASH_PORTS;
-
-       key[2] = sin->sin_addr.s_addr;
-
-       if ((*flags & FL_HASH_PORTS) == 0)
+       proto = ip->ip_p;
+       if ((*flags & FL_HASH_ALL) == 0) {
+               FLDPRINTF(ft, FL_DEBUG_ALL, "skip port check flags=0x%x ",
+                   *flags);
                goto skipports;
+       }
 
-       proto = ip->ip_p;
        iphlen = ip->ip_hl << 2; /* XXX options? */
-       key[1] = ip->ip_src.s_addr;
-       
+
        switch (proto) {
        case IPPROTO_TCP:
                th = (struct tcphdr *)((caddr_t)ip + iphlen);
-               sport = ntohs(th->th_sport);
-               dport = ntohs(th->th_dport);
-               *flags |= th->th_flags;
-               if (*flags & TH_RST)
+               sport = th->th_sport;
+               dport = th->th_dport;
+               if ((*flags & FL_HASH_ALL) &&
+                   (th->th_flags & (TH_RST|TH_FIN)))
                        *flags |= FL_STALE;
        break;
        case IPPROTO_UDP:
@@ -401,38 +540,288 @@ ipv4_flow_lookup_hash_internal(struct mb
                dport = sh->dest_port;
        break;
        default:
-               if (*flags & FL_HASH_PORTS)
-                       goto noop;
+               FLDPRINTF(ft, FL_DEBUG_ALL, "proto=0x%x not supported\n", 
proto);
+               return (ENOTSUP);
                /* no port - hence not a protocol we care about */
                break;
        
        }
-       *protop = proto;
 
-       /*
-        * If this is a transmit route cache then 
-        * hash all flows to a given destination to
-        * the same bucket
-        */
-       if ((*flags & FL_HASH_PORTS) == 0)
-               proto = sport = dport = 0;
+skipports:
+       *flags |= proto_to_flags(proto);
+       ssin->sin_port = sport;
+       dsin->sin_port = dport;
+       return (0);
+}
+
+static uint32_t
+ipv4_flow_lookup_hash_internal(
+       struct sockaddr_in *ssin, struct sockaddr_in *dsin, 
+           uint32_t *key, uint16_t flags)
+{
+       uint16_t sport, dport;
+       uint8_t proto;
+       int offset = 0;
+
+       if ((V_flowtable_enable == 0) || (V_flowtable_ready == 0))
+               return (0);
+       proto = flags_to_proto(flags);
+       sport = dport = key[2] = key[1] = key[0] = 0;
+       if ((ssin != NULL) && (flags & FL_HASH_ALL)) {
+               key[1] = ssin->sin_addr.s_addr;
+               sport = ssin->sin_port;
+       }
+       if (dsin != NULL) {
+               key[2] = dsin->sin_addr.s_addr;
+               dport = dsin->sin_port;
+       }
+       if (flags & FL_HASH_ALL) {
+               ((uint16_t *)key)[0] = sport;
+               ((uint16_t *)key)[1] = dport; 
+       } else
+               offset = V_flow_hashjitter + proto;
 
-       ((uint16_t *)key)[0] = sport;
-       ((uint16_t *)key)[1] = dport; 
+       return (jenkins_hashword(key, 3, offset));
+}
 
-skipports:
-       hash = jenkins_hashword(key, 3, V_flow_hashjitter + proto);
-       if (m != NULL && (m->m_flags & M_FLOWID) == 0) {
-               m->m_flags |= M_FLOWID;
-               m->m_pkthdr.flowid = hash;
+/*
+ * Look up the flow for an IPv4 packet: unpack the source and
+ * destination from the mbuf with ipv4_mbuf_demarshal(), then hand them
+ * to flowtable_lookup() together with the mbuf's FIB.  Returns NULL if
+ * demarshalling fails.
+ */
+static struct flentry *
+flowtable_lookup_mbuf4(struct flowtable *ft, struct mbuf *m)
+{
+       struct sockaddr_storage src, dst;
+       uint16_t fl;
+
+       /* Start from the table's flags; demarshalling may add to them. */
+       fl = ft->ft_flags;
+       if (ipv4_mbuf_demarshal(ft, m, (struct sockaddr_in *)&src,
+           (struct sockaddr_in *)&dst, &fl) != 0)
+               return (NULL);
+
+       return (flowtable_lookup(ft, &src, &dst, M_GETFIB(m), fl));
+}
+
+/*
+ * Fill in an IPv4 struct route from a cached flow: ro_dst receives the
+ * address held in word 2 of the flow's hash key, and ro_rt/ro_lle
+ * receive the flow's route and link-layer entries.
+ */
+void
+flow_to_route(struct flentry *fle, struct route *ro)
+{
+       struct sockaddr_in *dst = (struct sockaddr_in *)&ro->ro_dst;
+       uint32_t *k = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
+
+       dst->sin_family = AF_INET;
+       dst->sin_len = sizeof(*dst);
+       dst->sin_addr.s_addr = k[2];
+
+       ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
+       ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
+}
+#endif /* INET */
+
+#ifdef INET6
+/*
+ * PULLUP_TO(len, p, T) checks that the first mbuf already holds at
+ * least len + sizeof(T) bytes (m_len); if it does not, control jumps to
+ * the enclosing function's "receive_failed" label.  Despite the name,
+ * no pullup is performed here.  On success p is set to point at offset
+ * "len" in the mbuf data.  The macro expects a variable named "m" to be
+ * in scope at the point of use.
+ */
+#define PULLUP_TO(_len, p, T)                                          \
+do {                                                                   \
+       int x = (_len) + sizeof(T);                                     \
+       if ((m)->m_len < x) {                                           \
+               goto receive_failed;                                    \
+       }                                                               \
+       p = (mtod(m, char *) + (_len));                                 \
+} while (0)
+
+#define        TCP(p)          ((struct tcphdr *)(p))
+#define        SCTP(p)         ((struct sctphdr *)(p))
+#define        UDP(p)          ((struct udphdr *)(p))
+
+static int
+ipv6_mbuf_demarshal(struct flowtable *ft, struct mbuf *m,
+    struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6, uint16_t *flags)
+{
+       struct ip6_hdr *ip6;
+       uint8_t proto;
+       int hlen;
+       uint16_t src_port, dst_port;
+       u_short offset;
+       void *ulp;
+
+       offset = hlen = src_port = dst_port = 0;
+       ulp = NULL;
+       ip6 = mtod(m, struct ip6_hdr *);
+       hlen = sizeof(struct ip6_hdr);
+       proto = ip6->ip6_nxt;
+
+       if ((*flags & FL_HASH_ALL) == 0)
+               goto skipports;
+
+       while (ulp == NULL) {
+               switch (proto) {
+               case IPPROTO_ICMPV6:
+               case IPPROTO_OSPFIGP:
+               case IPPROTO_PIM:
+               case IPPROTO_CARP:
+               case IPPROTO_ESP:
+               case IPPROTO_NONE:
+                       ulp = ip6;
+                       break;
+               case IPPROTO_TCP:
+                       PULLUP_TO(hlen, ulp, struct tcphdr);
+                       dst_port = TCP(ulp)->th_dport;
+                       src_port = TCP(ulp)->th_sport;
+                       if ((*flags & FL_HASH_ALL) &&
+                           (TCP(ulp)->th_flags & (TH_RST|TH_FIN)))
+                               *flags |= FL_STALE;
+                       break;
+               case IPPROTO_SCTP:
+                       PULLUP_TO(hlen, ulp, struct sctphdr);
+                       src_port = SCTP(ulp)->src_port;
+                       dst_port = SCTP(ulp)->dest_port;
+                       break;
+               case IPPROTO_UDP:
+                       PULLUP_TO(hlen, ulp, struct udphdr);
+                       dst_port = UDP(ulp)->uh_dport;
+                       src_port = UDP(ulp)->uh_sport;
+                       break;
+               case IPPROTO_HOPOPTS:   /* RFC 2460 */
+                       PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                       hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                       proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                       ulp = NULL;
+                       break;
+               case IPPROTO_ROUTING:   /* RFC 2460 */
+                       PULLUP_TO(hlen, ulp, struct ip6_rthdr); 
+                       hlen += (((struct ip6_rthdr *)ulp)->ip6r_len + 1) << 3;
+                       proto = ((struct ip6_rthdr *)ulp)->ip6r_nxt;
+                       ulp = NULL;
+                       break;
+               case IPPROTO_FRAGMENT:  /* RFC 2460 */
+                       PULLUP_TO(hlen, ulp, struct ip6_frag);
+                       hlen += sizeof (struct ip6_frag);
+                       proto = ((struct ip6_frag *)ulp)->ip6f_nxt;
+                       offset = ((struct ip6_frag *)ulp)->ip6f_offlg &
+                           IP6F_OFF_MASK;
+                       ulp = NULL;
+                       break;
+               case IPPROTO_DSTOPTS:   /* RFC 2460 */
+                       PULLUP_TO(hlen, ulp, struct ip6_hbh);
+                       hlen += (((struct ip6_hbh *)ulp)->ip6h_len + 1) << 3;
+                       proto = ((struct ip6_hbh *)ulp)->ip6h_nxt;
+                       ulp = NULL;
+                       break;
+               case IPPROTO_AH:        /* RFC 2402 */
+                       PULLUP_TO(hlen, ulp, struct ip6_ext);
+                       hlen += (((struct ip6_ext *)ulp)->ip6e_len + 2) << 2;
+                       proto = ((struct ip6_ext *)ulp)->ip6e_nxt;
+                       ulp = NULL;
+                       break;
+               default:
+                       PULLUP_TO(hlen, ulp, struct ip6_ext);
+                       break;
+               }
+       }
+
+       if (src_port == 0) {
+       receive_failed:
+               return (ENOTSUP);
        }
 
-       return (hash);
-noop:
-       *protop = proto;
+skipports:
+       dsin6->sin6_family = AF_INET6;
+       dsin6->sin6_len = sizeof(*dsin6);
+       dsin6->sin6_port = dst_port;
+       memcpy(&dsin6->sin6_addr, &ip6->ip6_dst, sizeof(struct in6_addr));
+
+       ssin6->sin6_family = AF_INET6;
+       ssin6->sin6_len = sizeof(*ssin6);
+       ssin6->sin6_port = src_port;
+       memcpy(&ssin6->sin6_addr, &ip6->ip6_src, sizeof(struct in6_addr));
+       *flags |= proto_to_flags(proto);
+
        return (0);
 }
 
+/*
+ * Clear all nine 32-bit words of an IPv6 hash key.  The argument is
+ * parenthesized so that any pointer expression may be passed; note
+ * that it is evaluated once per word.
+ */
+#define zero_key(key)          \
+do {                           \
+       (key)[0] = 0;           \
+       (key)[1] = 0;           \
+       (key)[2] = 0;           \
+       (key)[3] = 0;           \
+       (key)[4] = 0;           \
+       (key)[5] = 0;           \
+       (key)[6] = 0;           \
+       (key)[7] = 0;           \
+       (key)[8] = 0;           \
+} while (0)
+       
+/*
+ * Build the 9-word IPv6 hash key in 'key' and return its hash.
+ *
+ * Key layout: words 1-4 hold the destination address (if dsin6 is
+ * given); words 5-8 hold the source address, and word 0 holds the
+ * source and destination ports, only when FL_HASH_ALL is set.
+ * Without FL_HASH_ALL the hash is seeded with V_flow_hashjitter plus
+ * the protocol derived from 'flags'; with it the seed is 0.
+ *
+ * Returns 0 without touching 'key' when the flowtable is disabled or
+ * not yet ready.
+ */
+static uint32_t
+ipv6_flow_lookup_hash_internal(
+       struct sockaddr_in6 *ssin6, struct sockaddr_in6 *dsin6,
+           uint32_t *key, uint16_t flags)
+{
+       uint16_t *portwords = (uint16_t *)key;
+       uint16_t srcport = 0, dstport = 0;
+       int hashall = (flags & FL_HASH_ALL) != 0;
+
+       if (!V_flowtable_enable || !V_flowtable_ready)
+               return (0);
+
+       zero_key(key);
+       if (dsin6 != NULL) {
+               memcpy(&key[1], &dsin6->sin6_addr, sizeof(struct in6_addr));
+               dstport = dsin6->sin6_port;
+       }
+       if (hashall && ssin6 != NULL) {
+               memcpy(&key[5], &ssin6->sin6_addr, sizeof(struct in6_addr));
+               srcport = ssin6->sin6_port;
+       }
+
+       if (!hashall)
+               return (jenkins_hashword(key, 9,
+                   V_flow_hashjitter + flags_to_proto(flags)));
+
+       portwords[0] = srcport;
+       portwords[1] = dstport;
+       return (jenkins_hashword(key, 9, 0));
+}
+
+/*
+ * Look up the flow for an IPv6 packet: unpack the source and
+ * destination from the mbuf with ipv6_mbuf_demarshal(), then hand them
+ * to flowtable_lookup() together with the mbuf's FIB.  Returns NULL if
+ * demarshalling fails.
+ */
+static struct flentry *
+flowtable_lookup_mbuf6(struct flowtable *ft, struct mbuf *m)
+{
+       struct sockaddr_storage src, dst;
+       uint16_t fl;
+
+       /* Start from the table's flags; demarshalling may add to them. */
+       fl = ft->ft_flags;
+       if (ipv6_mbuf_demarshal(ft, m, (struct sockaddr_in6 *)&src,
+           (struct sockaddr_in6 *)&dst, &fl) != 0)
+               return (NULL);
+
+       return (flowtable_lookup(ft, &src, &dst, M_GETFIB(m), fl));
+}
+
+/*
+ * Fill in an IPv6 struct route_in6 from a cached flow.
+ *
+ * ipv6_flow_lookup_hash_internal() stores the destination address in
+ * key words 1-4; words 5-8 hold the source address and are only filled
+ * when FL_HASH_ALL is set.  The route destination is therefore copied
+ * from word 1, mirroring flow_to_route(), which uses the IPv4
+ * destination word.
+ */
+void
+flow_to_route_in6(struct flentry *fle, struct route_in6 *ro)
+{
+       uint32_t *hashkey;
+       struct sockaddr_in6 *sin6;
+
+       sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+
+       sin6->sin6_family = AF_INET6;
+       sin6->sin6_len = sizeof(*sin6);
+       hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+       memcpy(&sin6->sin6_addr, &hashkey[1], sizeof (struct in6_addr));
+       ro->ro_rt = __DEVOLATILE(struct rtentry *, fle->f_rt);
+       ro->ro_lle = __DEVOLATILE(struct llentry *, fle->f_lle);
+}
+#endif /* INET6 */
+
 static bitstr_t *
 flowtable_mask(struct flowtable *ft)
 {
@@ -512,14 +901,30 @@ flowtable_set_hashkey(struct flentry *fl
                hashkey[i] = key[i];
 }
 
+
+/*
+ * Return a pointer to the hash key stored in a flow entry, viewing the
+ * entry as a flentry_v6 when FL_IPV6 is set in f_flags and as a
+ * flentry_v4 otherwise.
+ */
+static uint32_t *
+flowtable_get_hashkey(struct flentry *fle)
+{
+       uint32_t *hashkey;
+
+       if (fle->f_flags & FL_IPV6)
+               hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
+       else
+               hashkey = ((struct flentry_v4 *)fle)->fl_flow.ipf_key;
+
+       return (hashkey);
+}
+
 static int
 flowtable_insert(struct flowtable *ft, uint32_t hash, uint32_t *key,
-    uint8_t proto, uint32_t fibnum, struct route *ro, uint16_t flags)
+    uint32_t fibnum, struct route *ro, uint16_t flags)
 {
        struct flentry *fle, *fletail, *newfle, **flep;
+       struct flowtable_stats *fs = &ft->ft_stats[curcpu];
        int depth;
        uma_zone_t flezone;
        bitstr_t *mask;
+       uint8_t proto;
 
        flezone = (flags & FL_IPV6) ? V_flow_ipv6_zone : V_flow_ipv4_zone;
        newfle = uma_zalloc(flezone, M_NOWAIT | M_ZERO);
@@ -527,7 +932,8 @@ flowtable_insert(struct flowtable *ft, u
                return (ENOMEM);
 
        newfle->f_flags |= (flags & FL_IPV6);
-       
+       proto = flags_to_proto(flags);
+
        FL_ENTRY_LOCK(ft, hash);
        mask = flowtable_mask(ft);
        flep = flowtable_entry(ft, hash);
@@ -540,7 +946,7 @@ flowtable_insert(struct flowtable *ft, u
        } 
        
        depth = 0;
-       V_flowtable_collisions++;
+       fs->ft_collisions++;
        /*
         * find end of list and make sure that we were not
         * preempted by another thread handling this flow
@@ -554,6 +960,9 @@ flowtable_insert(struct flowtable *ft, u
                        FL_ENTRY_UNLOCK(ft, hash);
                        uma_zfree((newfle->f_flags & FL_IPV6) ?
                            V_flow_ipv6_zone : V_flow_ipv4_zone, newfle);
+
+                       if (flags & FL_OVERWRITE) 
+                               goto skip;
                        return (EEXIST);
                }
                /*
@@ -566,8 +975,8 @@ flowtable_insert(struct flowtable *ft, u
                fle = fle->f_next;
        } 
 
-       if (depth > V_flowtable_max_depth)
-               V_flowtable_max_depth = depth;
+       if (depth > fs->ft_max_depth)
+               fs->ft_max_depth = depth;
        fletail->f_next = newfle;
        fle = newfle;
 skip:
@@ -583,6 +992,35 @@ skip:
        return (0);
 }
 
+/*
+ * Insert a flow for the given source/destination pair using the
+ * caller-supplied route.
+ *
+ * The table's own flags and FL_OVERWRITE are OR-ed into 'flags';
+ * FL_OVERWRITE makes flowtable_insert() carry on instead of returning
+ * EEXIST when a matching entry is already present.
+ *
+ * Returns EINVAL if 'ro' lacks a route or link-layer entry,
+ * EAFNOSUPPORT if ssa's address family is not handled, and otherwise
+ * whatever flowtable_insert() returns.
+ */
+int
+kern_flowtable_insert(struct flowtable *ft,
+    struct sockaddr_storage *ssa, struct sockaddr_storage *dsa,
+    struct route *ro, uint32_t fibnum, int flags)
+{
+       uint32_t key[9], hash;
+
+       /* Reject an incomplete route before doing any hashing work. */
+       if (ro->ro_rt == NULL || ro->ro_lle == NULL)
+               return (EINVAL);
+
+       flags = (ft->ft_flags | flags | FL_OVERWRITE);
+       hash = 0;
+       /*
+        * The IPv4 hash fills only key[0..2], and either hash routine
+        * returns early without writing the key when the flowtable is
+        * disabled; keep every word defined.
+        */
+       bzero(key, sizeof(key));
+
+       switch (ssa->ss_family) {
+#ifdef INET
+       case AF_INET:
+               hash = ipv4_flow_lookup_hash_internal((struct sockaddr_in *)ssa,
+                   (struct sockaddr_in *)dsa, key, flags);
+               break;
+#endif
+#ifdef INET6
+       case AF_INET6:
+               /*
+                * Mark the flow as IPv6, as flowtable_lookup() does, so
+                * flowtable_insert() allocates from the IPv6 zone.
+                */
+               flags |= FL_IPV6;
+               hash = ipv6_flow_lookup_hash_internal(
+                   (struct sockaddr_in6 *)ssa,
+                   (struct sockaddr_in6 *)dsa, key, flags);
+               break;
+#endif
+       default:
+               return (EAFNOSUPPORT);
+       }
+
+       FLDPRINTF(ft, FL_DEBUG,
+           "kern_flowtable_insert: key=%x:%x:%x hash=%x fibnum=%d flags=%x\n",
+           key[0], key[1], key[2], hash, fibnum, flags);
+       return (flowtable_insert(ft, hash, key, fibnum, ro, flags));
+}
+
 static int
 flowtable_key_equal(struct flentry *fle, uint32_t *key)
 {
@@ -596,7 +1034,7 @@ flowtable_key_equal(struct flentry *fle,
                nwords = 3;
                hashkey = ((struct flentry_v6 *)fle)->fl_flow.ipf_key;
        }
-       
+
        for (i = 0; i < nwords; i++) 
                if (hashkey[i] != key[i])
                        return (0);
@@ -604,44 +1042,86 @@ flowtable_key_equal(struct flentry *fle,
        return (1);
 }
 
-int
-flowtable_lookup(struct flowtable *ft, struct mbuf *m, struct route *ro, 
uint32_t fibnum)
+/*
+ * Look up the flow for packet 'm' using the per-family helper selected
+ * by 'af'.  When a flow is found and the mbuf does not yet carry
+ * M_FLOWID, the mbuf is tagged with the flow's f_fhash.  Returns the
+ * flow entry, or NULL if none was found or 'af' is not handled.
+ */
+struct flentry *
+flowtable_lookup_mbuf(struct flowtable *ft, struct mbuf *m, int af)
+{
+       struct flentry *found = NULL;
+
+#ifdef INET
+       if (af == AF_INET)
+               found = flowtable_lookup_mbuf4(ft, m);
+#endif
+#ifdef INET6
+       if (af == AF_INET6)
+               found = flowtable_lookup_mbuf6(ft, m);
+#endif
+       /* Nothing to tag, or the packet already has a flow id. */
+       if (found == NULL || m == NULL || (m->m_flags & M_FLOWID) != 0)
+               return (found);
+
+       m->m_flags |= M_FLOWID;
+       m->m_pkthdr.flowid = found->f_fhash;
+       return (found);
+}
+       
+struct flentry *
+flowtable_lookup(struct flowtable *ft, struct sockaddr_storage *ssa,
+    struct sockaddr_storage *dsa, uint32_t fibnum, int flags)
 {
        uint32_t key[9], hash;
        struct flentry *fle;
-       uint16_t flags;
+       struct flowtable_stats *fs = &ft->ft_stats[curcpu];
        uint8_t proto = 0;
        int error = 0;
        struct rtentry *rt;
        struct llentry *lle;
+       struct route sro, *ro;
+       struct route_in6 sro6;
 
-       flags = ft->ft_flags;
-       ro->ro_rt = NULL;
-       ro->ro_lle = NULL;
+       sro.ro_rt = sro6.ro_rt = NULL;
+       sro.ro_lle = sro6.ro_lle = NULL;
+       ro = NULL;
+       hash = 0;
+       flags |= ft->ft_flags;
+       proto = flags_to_proto(flags);
+#ifdef INET
+       if (ssa->ss_family == AF_INET) {
+               struct sockaddr_in *ssin, *dsin;
+
+               ro = &sro;
+               memcpy(&ro->ro_dst, dsa, sizeof(struct sockaddr_in));
+               dsin = (struct sockaddr_in *)dsa;
+               ssin = (struct sockaddr_in *)ssa;
+               if ((dsin->sin_addr.s_addr == ssin->sin_addr.s_addr) ||
+                   (ntohl(dsin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == 
IN_LOOPBACKNET ||
+                   (ntohl(ssin->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) == 
IN_LOOPBACKNET)
+                       return (NULL);
 
-       /*
-        * The internal hash lookup is the only IPv4 specific bit
-        * remaining
-        *
-        * XXX BZ: to add IPv6 support just add a check for the
-        * address type in m and ro and an equivalent ipv6 lookup
-        * function - the rest of the code should automatically
-        * handle an ipv6 flow (note that m can be NULL in which
-        * case ro will be set)
-        */
-       hash = ipv4_flow_lookup_hash_internal(m, ro, key,
-           &flags, &proto);
+               hash = ipv4_flow_lookup_hash_internal(ssin, dsin, key, flags);
+       }
+#endif
+#ifdef INET6
+       if (ssa->ss_family == AF_INET6) {
+               struct sockaddr_in6 *ssin6, *dsin6;
+
+               ro = (struct route *)&sro6;
+               memcpy(&sro6.ro_dst, dsa,
+                   sizeof(struct sockaddr_in6));
+               dsin6 = (struct sockaddr_in6 *)dsa;
+               ssin6 = (struct sockaddr_in6 *)ssa;
 
+               flags |= FL_IPV6;
+               hash = ipv6_flow_lookup_hash_internal(ssin6, dsin6, key, flags);
+       }
+#endif
        /*
         * Ports are zero and this isn't a transmit cache
         * - thus not a protocol for which we need to keep 
         * state
-        * FL_HASH_PORTS => key[0] != 0 for TCP || UDP || SCTP
+        * FL_HASH_ALL => key[0] != 0 for TCP || UDP || SCTP
         */
-       if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_PORTS)))
-               return (ENOENT);
+       if (hash == 0 || (key[0] == 0 && (ft->ft_flags & FL_HASH_ALL)))
+               return (NULL);
 
-       V_flowtable_lookups++;
+       fs->ft_lookups++;
        FL_ENTRY_LOCK(ft, hash);
        if ((fle = FL_ENTRY(ft, hash)) == NULL) {
                FL_ENTRY_UNLOCK(ft, hash);
@@ -657,21 +1137,21 @@ keycheck:        
            && (fibnum == fle->f_fibnum)
            && (rt->rt_flags & RTF_UP)
            && (rt->rt_ifp != NULL)) {
-               V_flowtable_hits++;
+               fs->ft_hits++;
                fle->f_uptime = time_uptime;
                fle->f_flags |= flags;
-               ro->ro_rt = rt;
-               ro->ro_lle = lle;
                FL_ENTRY_UNLOCK(ft, hash);
-               return (0);
+               return (fle);
        } else if (fle->f_next != NULL) {
                fle = fle->f_next;
                goto keycheck;
        }
        FL_ENTRY_UNLOCK(ft, hash);
-
 uncached:
-       V_flowtable_misses++;
+       if (flags & FL_NOAUTO)
+               return (NULL);
+
+       fs->ft_misses++;
        /*
         * This bit of code ends up locking the
         * same route 3 times (just like ip_output + ether_output)
@@ -684,36 +1164,64 @@ uncached:
         * receive the route locked
         */
 
+#ifdef INVARIANTS
+       if ((ro->ro_dst.sa_family != AF_INET) &&
+           (ro->ro_dst.sa_family != AF_INET6))
+               panic("sa_family == %d\n", ro->ro_dst.sa_family);
+#endif
+
        ft->ft_rtalloc(ro, hash, fibnum);
        if (ro->ro_rt == NULL) 
                error = ENETUNREACH;
        else {
                struct llentry *lle = NULL;
-               struct sockaddr *l3addr;
+               struct sockaddr_storage *l3addr;
                struct rtentry *rt = ro->ro_rt;
                struct ifnet *ifp = rt->rt_ifp;
 
                if (ifp->if_flags & (IFF_POINTOPOINT | IFF_LOOPBACK)) {
                        RTFREE(rt);
                        ro->ro_rt = NULL;
-                       return (ENOENT);
+                       return (NULL);
                }
+#ifdef INET6
+               if (ssa->ss_family == AF_INET6) {
+                       struct sockaddr_in6 *dsin6;
+
+                       dsin6 = (struct sockaddr_in6 *)dsa;                     
+                       if (in6_localaddr(&dsin6->sin6_addr)) {
+                               RTFREE(rt);
+                               ro->ro_rt = NULL;
+                               return (NULL);                          
+                       }
 
-               if (rt->rt_flags & RTF_GATEWAY)
-                       l3addr = rt->rt_gateway;
-               else
-                       l3addr = &ro->ro_dst;
-               llentry_update(&lle, LLTABLE(ifp), l3addr, ifp);
+                       if (rt->rt_flags & RTF_GATEWAY)
+                               l3addr = (struct sockaddr_storage 
*)rt->rt_gateway;
+                       
+                       else

*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
_______________________________________________
svn-src-head@freebsd.org mailing list
http://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to