Author: ae Date: Sun Apr 14 13:08:18 2019 New Revision: 346213 URL: https://svnweb.freebsd.org/changeset/base/346213
Log: MFC r345293: Update NAT64LSN implementation: o most of the data structures and their relations were modified to be able to support a large number of translation states. Now each supported protocol can use the full port range. Ports groups now belong to IPv4 alias addresses, not hosts. Each ports group can keep several states chunks. This is controlled with the new `states_chunks` config option. States chunks allow having several translation states for a single alias address and port, but for different destination addresses. o by default all hash tables now use jenkins hash. o ConcurrencyKit and epoch(9) are used to make NAT64LSN lockless on the fast path. o one NAT64LSN instance can now be used to handle several IPv6 prefixes; the special prefix "::" value should be used for this purpose when the instance is created. o due to the modified relations between internal data structures, the socket opcode that does states listing was changed. Obtained from: Yandex LLC Sponsored by: Yandex LLC MFC r345294: Remove extra spaces. Modified: stable/12/sbin/ipfw/ipfw.8 stable/12/sbin/ipfw/ipfw2.h stable/12/sbin/ipfw/nat64lsn.c stable/12/sys/conf/files stable/12/sys/modules/ipfw_nat64/Makefile stable/12/sys/netinet6/ip_fw_nat64.h stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c stable/12/sys/netpfil/ipfw/nat64/nat64lsn.h stable/12/sys/netpfil/ipfw/nat64/nat64lsn_control.c Directory Properties: stable/12/ (props changed) Modified: stable/12/sbin/ipfw/ipfw.8 ============================================================================== --- stable/12/sbin/ipfw/ipfw.8 Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sbin/ipfw/ipfw.8 Sun Apr 14 13:08:18 2019 (r346213) @@ -1,7 +1,7 @@ .\" .\" $FreeBSD$ .\" -.Dd March 18, 2019 +.Dd March 19, 2019 .Dt IPFW 8 .Os .Sh NAME @@ -3300,6 +3300,7 @@ See .Sx SYSCTL VARIABLES for more info. .Sh IPv6/IPv4 NETWORK ADDRESS AND PROTOCOL TRANSLATION +.Ss Stateful translation .Nm supports in-kernel IPv6/IPv4 network address and protocol translation. 
Stateful NAT64 translation allows IPv6-only clients to contact IPv4 servers @@ -3317,7 +3318,8 @@ to be able use stateful NAT64 translator. Stateful NAT64 uses a bunch of memory for several types of objects. When IPv6 client initiates connection, NAT64 translator creates a host entry in the states table. -Each host entry has a number of ports group entries allocated on demand. +Each host entry uses preallocated IPv4 alias entry. +Each alias entry has a number of ports group entries allocated on demand. Ports group entries contains connection state entries. There are several options to control limits and lifetime for these objects. .Pp @@ -3337,6 +3339,11 @@ First time an original packet is handled and consumed and then it is handled again as translated packet. This behavior can be changed by sysctl variable .Va net.inet.ip.fw.nat64_direct_output . +Also translated packet can be tagged using +.Cm tag +rule action, and then matched by +.Cm tagged +opcode to avoid loops and extra overhead. .Pp The stateful NAT64 configuration command is the following: .Bd -ragged -offset indent @@ -3364,15 +3371,16 @@ to represent IPv4 addresses. This IPv6 prefix should b The translator implementation follows RFC6052, that restricts the length of prefixes to one of following: 32, 40, 48, 56, 64, or 96. The Well-Known IPv6 Prefix 64:ff9b:: must be 96 bits long. -.It Cm max_ports Ar number -Maximum number of ports reserved for upper level protocols to one IPv6 client. -All reserved ports are divided into chunks between supported protocols. -The number of connections from one IPv6 client is limited by this option. -Note that closed TCP connections still remain in the list of connections until -.Cm tcp_close_age -interval will not expire. -Default value is -.Ar 2048 . +The special +.Ar ::/length +prefix can be used to handle several IPv6 prefixes with one NAT64 instance. +The NAT64 instance will determine a destination IPv4 address from prefix +.Ar length . 
+.It Cm states_chunks Ar number +The number of states chunks in single ports group. +Each ports group by default can keep 64 state entries in single chunk. +The above value affects the maximum number of states that can be associated with single IPv4 alias address and port. +The value must be power of 2, and up to 128. .It Cm host_del_age Ar seconds The number of seconds until the host entry for a IPv6 client will be deleted and all its resources will be released due to inactivity. Modified: stable/12/sbin/ipfw/ipfw2.h ============================================================================== --- stable/12/sbin/ipfw/ipfw2.h Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sbin/ipfw/ipfw2.h Sun Apr 14 13:08:18 2019 (r346213) @@ -278,6 +278,7 @@ enum tokens { TOK_AGG_LEN, TOK_AGG_COUNT, TOK_MAX_PORTS, + TOK_STATES_CHUNKS, TOK_JMAXLEN, TOK_PORT_RANGE, TOK_HOST_DEL_AGE, Modified: stable/12/sbin/ipfw/nat64lsn.c ============================================================================== --- stable/12/sbin/ipfw/nat64lsn.c Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sbin/ipfw/nat64lsn.c Sun Apr 14 13:08:18 2019 (r346213) @@ -87,68 +87,70 @@ nat64lsn_print_states(void *buf) char sflags[4], *sf, *proto; ipfw_obj_header *oh; ipfw_obj_data *od; - ipfw_nat64lsn_stg *stg; - ipfw_nat64lsn_state *ste; + ipfw_nat64lsn_stg_v1 *stg; + ipfw_nat64lsn_state_v1 *ste; uint64_t next_idx; int i, sz; oh = (ipfw_obj_header *)buf; od = (ipfw_obj_data *)(oh + 1); - stg = (ipfw_nat64lsn_stg *)(od + 1); + stg = (ipfw_nat64lsn_stg_v1 *)(od + 1); sz = od->head.length - sizeof(*od); next_idx = 0; while (sz > 0 && next_idx != 0xFF) { - next_idx = stg->next_idx; + next_idx = stg->next.index; sz -= sizeof(*stg); if (stg->count == 0) { stg++; continue; } - switch (stg->proto) { - case IPPROTO_TCP: - proto = "TCP"; - break; - case IPPROTO_UDP: - proto = "UDP"; - break; - case IPPROTO_ICMPV6: - proto = "ICMPv6"; - break; - } - inet_ntop(AF_INET6, &stg->host6, s, sizeof(s)); + /* + * NOTE: 
addresses are in network byte order, + * ports are in host byte order. + */ inet_ntop(AF_INET, &stg->alias4, a, sizeof(a)); - ste = (ipfw_nat64lsn_state *)(stg + 1); + ste = (ipfw_nat64lsn_state_v1 *)(stg + 1); for (i = 0; i < stg->count && sz > 0; i++) { sf = sflags; + inet_ntop(AF_INET6, &ste->host6, s, sizeof(s)); inet_ntop(AF_INET, &ste->daddr, f, sizeof(f)); - if (stg->proto == IPPROTO_TCP) { + switch (ste->proto) { + case IPPROTO_TCP: + proto = "TCP"; if (ste->flags & 0x02) *sf++ = 'S'; if (ste->flags & 0x04) *sf++ = 'E'; if (ste->flags & 0x01) *sf++ = 'F'; + break; + case IPPROTO_UDP: + proto = "UDP"; + break; + case IPPROTO_ICMP: + proto = "ICMPv6"; + break; } *sf = '\0'; - switch (stg->proto) { + switch (ste->proto) { case IPPROTO_TCP: case IPPROTO_UDP: printf("%s:%d\t%s:%d\t%s\t%s\t%d\t%s:%d\n", s, ste->sport, a, ste->aport, proto, sflags, ste->idle, f, ste->dport); break; - case IPPROTO_ICMPV6: + case IPPROTO_ICMP: printf("%s\t%s\t%s\t\t%d\t%s\n", s, a, proto, ste->idle, f); break; default: printf("%s\t%s\t%d\t\t%d\t%s\n", - s, a, stg->proto, ste->idle, f); + s, a, ste->proto, ste->idle, f); } ste++; sz -= sizeof(*ste); } - stg = (ipfw_nat64lsn_stg *)ste; + stg = (ipfw_nat64lsn_stg_v1 *)ste; } return (next_idx); } @@ -174,6 +176,7 @@ nat64lsn_states_cb(ipfw_nat64lsn_cfg *cfg, const char err(EX_OSERR, NULL); do { oh = (ipfw_obj_header *)buf; + oh->opheader.version = 1; /* Force using ov new API */ od = (ipfw_obj_data *)(oh + 1); nat64lsn_fill_ntlv(&oh->ntlv, cfg->name, set); od->head.type = IPFW_TLV_OBJDATA; @@ -363,12 +366,8 @@ nat64lsn_parse_int(const char *arg, const char *desc) static struct _s_x nat64newcmds[] = { { "prefix6", TOK_PREFIX6 }, - { "agg_len", TOK_AGG_LEN }, /* not yet */ - { "agg_count", TOK_AGG_COUNT }, /* not yet */ - { "port_range", TOK_PORT_RANGE }, /* not yet */ { "jmaxlen", TOK_JMAXLEN }, { "prefix4", TOK_PREFIX4 }, - { "max_ports", TOK_MAX_PORTS }, { "host_del_age", TOK_HOST_DEL_AGE }, { "pg_del_age", TOK_PG_DEL_AGE }, { 
"tcp_syn_age", TOK_TCP_SYN_AGE }, @@ -376,10 +375,13 @@ static struct _s_x nat64newcmds[] = { { "tcp_est_age", TOK_TCP_EST_AGE }, { "udp_age", TOK_UDP_AGE }, { "icmp_age", TOK_ICMP_AGE }, + { "states_chunks",TOK_STATES_CHUNKS }, { "log", TOK_LOG }, { "-log", TOK_LOGOFF }, { "allow_private", TOK_PRIVATE }, { "-allow_private", TOK_PRIVATEOFF }, + /* for compatibility with old configurations */ + { "max_ports", TOK_MAX_PORTS }, /* unused */ { NULL, 0 } }; @@ -436,42 +438,17 @@ nat64lsn_create(const char *name, uint8_t set, int ac, nat64lsn_parse_prefix(*av, AF_INET6, &cfg->prefix6, &cfg->plen6); if (ipfw_check_nat64prefix(&cfg->prefix6, - cfg->plen6) != 0) + cfg->plen6) != 0 && + !IN6_IS_ADDR_UNSPECIFIED(&cfg->prefix6)) errx(EX_USAGE, "Bad prefix6 %s", *av); ac--; av++; break; -#if 0 - case TOK_AGG_LEN: - NEED1("Aggregation prefix len required"); - cfg->agg_prefix_len = nat64lsn_parse_int(*av, opt); - ac--; av++; - break; - case TOK_AGG_COUNT: - NEED1("Max per-prefix count required"); - cfg->agg_prefix_max = nat64lsn_parse_int(*av, opt); - ac--; av++; - break; - case TOK_PORT_RANGE: - NEED1("port range x[:y] required"); - if ((p = strchr(*av, ':')) == NULL) - cfg->min_port = (uint16_t)nat64lsn_parse_int( - *av, opt); - else { - *p++ = '\0'; - cfg->min_port = (uint16_t)nat64lsn_parse_int( - *av, opt); - cfg->max_port = (uint16_t)nat64lsn_parse_int( - p, opt); - } - ac--; av++; - break; case TOK_JMAXLEN: NEED1("job queue length required"); cfg->jmaxlen = nat64lsn_parse_int(*av, opt); ac--; av++; break; -#endif case TOK_MAX_PORTS: NEED1("Max per-user ports required"); cfg->max_ports = nat64lsn_parse_int(*av, opt); @@ -519,6 +496,12 @@ nat64lsn_create(const char *name, uint8_t set, int ac, *av, opt); ac--; av++; break; + case TOK_STATES_CHUNKS: + NEED1("number of chunks required"); + cfg->states_chunks = (uint8_t)nat64lsn_parse_int( + *av, opt); + ac--; av++; + break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; @@ -630,6 +613,12 @@ nat64lsn_config(const char *name, 
uint8_t set, int ac, *av, opt); ac--; av++; break; + case TOK_STATES_CHUNKS: + NEED1("number of chunks required"); + cfg->states_chunks = (uint8_t)nat64lsn_parse_int( + *av, opt); + ac--; av++; + break; case TOK_LOG: cfg->flags |= NAT64_LOG; break; @@ -789,31 +778,24 @@ nat64lsn_show_cb(ipfw_nat64lsn_cfg *cfg, const char *n printf("nat64lsn %s prefix4 %s/%u", cfg->name, abuf, cfg->plen4); inet_ntop(AF_INET6, &cfg->prefix6, abuf, sizeof(abuf)); printf(" prefix6 %s/%u", abuf, cfg->plen6); -#if 0 - printf("agg_len %u agg_count %u ", cfg->agg_prefix_len, - cfg->agg_prefix_max); - if (cfg->min_port != NAT64LSN_PORT_MIN || - cfg->max_port != NAT64LSN_PORT_MAX) - printf(" port_range %u:%u", cfg->min_port, cfg->max_port); - if (cfg->jmaxlen != NAT64LSN_JMAXLEN) - printf(" jmaxlen %u ", cfg->jmaxlen); -#endif - if (cfg->max_ports != NAT64LSN_MAX_PORTS) - printf(" max_ports %u", cfg->max_ports); - if (cfg->nh_delete_delay != NAT64LSN_HOST_AGE) + if (co.verbose || cfg->states_chunks > 1) + printf(" states_chunks %u", cfg->states_chunks); + if (co.verbose || cfg->nh_delete_delay != NAT64LSN_HOST_AGE) printf(" host_del_age %u", cfg->nh_delete_delay); - if (cfg->pg_delete_delay != NAT64LSN_PG_AGE) - printf(" pg_del_age %u ", cfg->pg_delete_delay); - if (cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE) + if (co.verbose || cfg->pg_delete_delay != NAT64LSN_PG_AGE) + printf(" pg_del_age %u", cfg->pg_delete_delay); + if (co.verbose || cfg->st_syn_ttl != NAT64LSN_TCP_SYN_AGE) printf(" tcp_syn_age %u", cfg->st_syn_ttl); - if (cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE) + if (co.verbose || cfg->st_close_ttl != NAT64LSN_TCP_FIN_AGE) printf(" tcp_close_age %u", cfg->st_close_ttl); - if (cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE) + if (co.verbose || cfg->st_estab_ttl != NAT64LSN_TCP_EST_AGE) printf(" tcp_est_age %u", cfg->st_estab_ttl); - if (cfg->st_udp_ttl != NAT64LSN_UDP_AGE) + if (co.verbose || cfg->st_udp_ttl != NAT64LSN_UDP_AGE) printf(" udp_age %u", cfg->st_udp_ttl); - if (cfg->st_icmp_ttl 
!= NAT64LSN_ICMP_AGE) + if (co.verbose || cfg->st_icmp_ttl != NAT64LSN_ICMP_AGE) printf(" icmp_age %u", cfg->st_icmp_ttl); + if (co.verbose || cfg->jmaxlen != NAT64LSN_JMAXLEN) + printf(" jmaxlen %u", cfg->jmaxlen); if (cfg->flags & NAT64_LOG) printf(" log"); if (cfg->flags & NAT64_ALLOW_PRIVATE) Modified: stable/12/sys/conf/files ============================================================================== --- stable/12/sys/conf/files Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sys/conf/files Sun Apr 14 13:08:18 2019 (r346213) @@ -4448,9 +4448,9 @@ netpfil/ipfw/nat64/nat64clat.c optional inet inet6 ipf netpfil/ipfw/nat64/nat64clat_control.c optional inet inet6 ipfirewall \ ipfirewall_nat64 netpfil/ipfw/nat64/nat64lsn.c optional inet inet6 ipfirewall \ - ipfirewall_nat64 + ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include" netpfil/ipfw/nat64/nat64lsn_control.c optional inet inet6 ipfirewall \ - ipfirewall_nat64 + ipfirewall_nat64 compile-with "${NORMAL_C} -I$S/contrib/ck/include" netpfil/ipfw/nat64/nat64stl.c optional inet inet6 ipfirewall \ ipfirewall_nat64 netpfil/ipfw/nat64/nat64stl_control.c optional inet inet6 ipfirewall \ Modified: stable/12/sys/modules/ipfw_nat64/Makefile ============================================================================== --- stable/12/sys/modules/ipfw_nat64/Makefile Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sys/modules/ipfw_nat64/Makefile Sun Apr 14 13:08:18 2019 (r346213) @@ -8,4 +8,6 @@ SRCS+= nat64clat.c nat64clat_control.c SRCS+= nat64lsn.c nat64lsn_control.c SRCS+= nat64stl.c nat64stl_control.c +CFLAGS+= -I${SRCTOP}/sys/contrib/ck/include + .include <bsd.kmod.mk> Modified: stable/12/sys/netinet6/ip_fw_nat64.h ============================================================================== --- stable/12/sys/netinet6/ip_fw_nat64.h Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sys/netinet6/ip_fw_nat64.h Sun Apr 14 13:08:18 2019 (r346213) @@ -122,7 +122,7 @@ typedef struct _ipfw_nat64clat_cfg { /* 
* NAT64LSN default configuration values */ -#define NAT64LSN_MAX_PORTS 2048 /* Max number of ports per host */ +#define NAT64LSN_MAX_PORTS 2048 /* Unused */ #define NAT64LSN_JMAXLEN 2048 /* Max outstanding requests. */ #define NAT64LSN_TCP_SYN_AGE 10 /* State's TTL after SYN received. */ #define NAT64LSN_TCP_EST_AGE (2 * 3600) /* TTL for established connection */ @@ -135,16 +135,20 @@ typedef struct _ipfw_nat64clat_cfg { typedef struct _ipfw_nat64lsn_cfg { char name[64]; /* NAT name */ uint32_t flags; - uint32_t max_ports; /* Max ports per client */ - uint32_t agg_prefix_len; /* Prefix length to count */ - uint32_t agg_prefix_max; /* Max hosts per agg prefix */ + + uint32_t max_ports; /* Unused */ + uint32_t agg_prefix_len; /* Unused */ + uint32_t agg_prefix_max; /* Unused */ + struct in_addr prefix4; uint16_t plen4; /* Prefix length */ uint16_t plen6; /* Prefix length */ struct in6_addr prefix6; /* NAT64 prefix */ uint32_t jmaxlen; /* Max jobqueue length */ - uint16_t min_port; /* Min port group # to use */ - uint16_t max_port; /* Max port group # to use */ + + uint16_t min_port; /* Unused */ + uint16_t max_port; /* Unused */ + uint16_t nh_delete_delay;/* Stale host delete delay */ uint16_t pg_delete_delay;/* Stale portgroup delete delay */ uint16_t st_syn_ttl; /* TCP syn expire */ @@ -153,7 +157,7 @@ typedef struct _ipfw_nat64lsn_cfg { uint16_t st_udp_ttl; /* UDP expire */ uint16_t st_icmp_ttl; /* ICMP expire */ uint8_t set; /* Named instance set [0..31] */ - uint8_t spare; + uint8_t states_chunks; /* Number of states chunks per PG */ } ipfw_nat64lsn_cfg; typedef struct _ipfw_nat64lsn_state { @@ -177,5 +181,30 @@ typedef struct _ipfw_nat64lsn_stg { uint32_t spare2; } ipfw_nat64lsn_stg; -#endif /* _NETINET6_IP_FW_NAT64_H_ */ +typedef struct _ipfw_nat64lsn_state_v1 { + struct in6_addr host6; /* Bound IPv6 host */ + struct in_addr daddr; /* Remote IPv4 address */ + uint16_t dport; /* Remote destination port */ + uint16_t aport; /* Local alias port */ + uint16_t 
sport; /* Source port */ + uint16_t spare; + uint16_t idle; /* Last used time */ + uint8_t flags; /* State flags */ + uint8_t proto; /* protocol */ +} ipfw_nat64lsn_state_v1; +typedef struct _ipfw_nat64lsn_stg_v1 { + union nat64lsn_pgidx { + uint64_t index; + struct { + uint8_t chunk; /* states chunk */ + uint8_t proto; /* protocol */ + uint16_t port; /* base port */ + in_addr_t addr; /* alias address */ + }; + } next; /* next state index */ + struct in_addr alias4; /* IPv4 alias address */ + uint32_t count; /* Number of states */ +} ipfw_nat64lsn_stg_v1; + +#endif /* _NETINET6_IP_FW_NAT64_H_ */ Modified: stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c ============================================================================== --- stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c Sun Apr 14 12:39:09 2019 (r346212) +++ stable/12/sys/netpfil/ipfw/nat64/nat64lsn.c Sun Apr 14 13:08:18 2019 (r346213) @@ -33,16 +33,17 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/counter.h> +#include <sys/ck.h> +#include <sys/epoch.h> #include <sys/errno.h> +#include <sys/hash.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/module.h> #include <sys/rmlock.h> -#include <sys/rwlock.h> #include <sys/socket.h> -#include <sys/queue.h> #include <sys/syslog.h> #include <sys/sysctl.h> @@ -71,17 +72,22 @@ __FBSDID("$FreeBSD$"); MALLOC_DEFINE(M_NAT64LSN, "NAT64LSN", "NAT64LSN"); -static void nat64lsn_periodic(void *data); -#define PERIODIC_DELAY 4 -static uint8_t nat64lsn_proto_map[256]; -uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; +static epoch_t nat64lsn_epoch; +#define NAT64LSN_EPOCH_ENTER(et) epoch_enter_preempt(nat64lsn_epoch, &(et)) +#define NAT64LSN_EPOCH_EXIT(et) epoch_exit_preempt(nat64lsn_epoch, &(et)) +#define NAT64LSN_EPOCH_WAIT() epoch_wait_preempt(nat64lsn_epoch) +#define NAT64LSN_EPOCH_ASSERT() MPASS(in_epoch(nat64lsn_epoch)) +#define NAT64LSN_EPOCH_CALL(c, f) epoch_call(nat64lsn_epoch, (c), 
(f)) -#define NAT64_FLAG_FIN 0x01 /* FIN was seen */ -#define NAT64_FLAG_SYN 0x02 /* First syn in->out */ -#define NAT64_FLAG_ESTAB 0x04 /* Packet with Ack */ -#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) +static uma_zone_t nat64lsn_host_zone; +static uma_zone_t nat64lsn_pgchunk_zone; +static uma_zone_t nat64lsn_pg_zone; +static uma_zone_t nat64lsn_aliaslink_zone; +static uma_zone_t nat64lsn_state_zone; +static uma_zone_t nat64lsn_job_zone; -#define NAT64_FLAG_RDR 0x80 /* Port redirect */ +static void nat64lsn_periodic(void *data); +#define PERIODIC_DELAY 4 #define NAT64_LOOKUP(chain, cmd) \ (struct nat64lsn_cfg *)SRV_OBJECT((chain), (cmd)->arg1) /* @@ -91,25 +97,33 @@ uint8_t nat64lsn_rproto_map[NAT_MAX_PROTO]; enum nat64lsn_jtype { JTYPE_NEWHOST = 1, JTYPE_NEWPORTGROUP, - JTYPE_DELPORTGROUP, + JTYPE_DESTROY, }; struct nat64lsn_job_item { - TAILQ_ENTRY(nat64lsn_job_item) next; + STAILQ_ENTRY(nat64lsn_job_item) entries; enum nat64lsn_jtype jtype; - struct nat64lsn_host *nh; - struct nat64lsn_portgroup *pg; - void *spare_idx; - struct in6_addr haddr; - uint8_t nat_proto; - uint8_t done; - int needs_idx; - int delcount; - unsigned int fhash; /* Flow hash */ - uint32_t aaddr; /* Last used address (net) */ - struct mbuf *m; - struct ipfw_flow_id f_id; - uint64_t delmask[NAT64LSN_PGPTRNMASK]; + + union { + struct { /* used by JTYPE_NEWHOST, JTYPE_NEWPORTGROUP */ + struct mbuf *m; + struct nat64lsn_host *host; + struct nat64lsn_state *state; + uint32_t src6_hval; + uint32_t state_hval; + struct ipfw_flow_id f_id; + in_addr_t faddr; + uint16_t port; + uint8_t proto; + uint8_t done; + }; + struct { /* used by JTYPE_DESTROY */ + struct nat64lsn_hosts_slist hosts; + struct nat64lsn_pg_slist portgroups; + struct nat64lsn_pgchunk *pgchunk; + struct epoch_context epoch_ctx; + }; + }; }; static struct mtx jmtx; @@ -118,143 +132,311 @@ static struct mtx jmtx; #define JQUEUE_LOCK() mtx_lock(&jmtx) #define JQUEUE_UNLOCK() mtx_unlock(&jmtx) +static int 
nat64lsn_alloc_host(struct nat64lsn_cfg *cfg, + struct nat64lsn_job_item *ji); +static int nat64lsn_alloc_pg(struct nat64lsn_cfg *cfg, + struct nat64lsn_job_item *ji); +static struct nat64lsn_job_item *nat64lsn_create_job( + struct nat64lsn_cfg *cfg, int jtype); static void nat64lsn_enqueue_job(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); -static void nat64lsn_enqueue_jobs(struct nat64lsn_cfg *cfg, - struct nat64lsn_job_head *jhead, int jlen); +static void nat64lsn_job_destroy(epoch_context_t ctx); +static void nat64lsn_destroy_host(struct nat64lsn_host *host); +static void nat64lsn_destroy_pg(struct nat64lsn_pg *pg); -static struct nat64lsn_job_item *nat64lsn_create_job(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, int jtype); -static int nat64lsn_request_portgroup(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm, uint32_t aaddr, - int needs_idx); -static int nat64lsn_request_host(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm); static int nat64lsn_translate4(struct nat64lsn_cfg *cfg, - const struct ipfw_flow_id *f_id, struct mbuf **pm); + const struct ipfw_flow_id *f_id, struct mbuf **mp); static int nat64lsn_translate6(struct nat64lsn_cfg *cfg, - struct ipfw_flow_id *f_id, struct mbuf **pm); + struct ipfw_flow_id *f_id, struct mbuf **mp); +static int nat64lsn_translate6_internal(struct nat64lsn_cfg *cfg, + struct mbuf **mp, struct nat64lsn_state *state, uint8_t flags); -static int alloc_portgroup(struct nat64lsn_job_item *ji); -static void destroy_portgroup(struct nat64lsn_portgroup *pg); -static void destroy_host6(struct nat64lsn_host *nh); -static int alloc_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); +#define NAT64_BIT_TCP_FIN 0 /* FIN was seen */ +#define NAT64_BIT_TCP_SYN 1 /* First syn in->out */ +#define NAT64_BIT_TCP_ESTAB 2 /* Packet with Ack */ +#define NAT64_BIT_READY_IPV4 6 /* state is ready for translate4 */ +#define NAT64_BIT_STALE 7 /* state is 
going to be expired */ -static int attach_portgroup(struct nat64lsn_cfg *cfg, - struct nat64lsn_job_item *ji); -static int attach_host6(struct nat64lsn_cfg *cfg, struct nat64lsn_job_item *ji); +#define NAT64_FLAG_FIN (1 << NAT64_BIT_TCP_FIN) +#define NAT64_FLAG_SYN (1 << NAT64_BIT_TCP_SYN) +#define NAT64_FLAG_ESTAB (1 << NAT64_BIT_TCP_ESTAB) +#define NAT64_FLAGS_TCP (NAT64_FLAG_SYN|NAT64_FLAG_ESTAB|NAT64_FLAG_FIN) +#define NAT64_FLAG_READY (1 << NAT64_BIT_READY_IPV4) +#define NAT64_FLAG_STALE (1 << NAT64_BIT_STALE) -/* XXX tmp */ -static uma_zone_t nat64lsn_host_zone; -static uma_zone_t nat64lsn_pg_zone; -static uma_zone_t nat64lsn_pgidx_zone; +static inline uint8_t +convert_tcp_flags(uint8_t flags) +{ + uint8_t result; -static unsigned int nat64lsn_periodic_chkstates(struct nat64lsn_cfg *cfg, - struct nat64lsn_host *nh); + result = flags & (TH_FIN|TH_SYN); + result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ + result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ -#define I6_hash(x) (djb_hash((const unsigned char *)(x), 16)) -#define I6_first(_ph, h) (_ph)[h] -#define I6_next(x) (x)->next -#define I6_val(x) (&(x)->addr) -#define I6_cmp(a, b) IN6_ARE_ADDR_EQUAL(a, b) -#define I6_lock(a, b) -#define I6_unlock(a, b) + return (result); +} -#define I6HASH_FIND(_cfg, _res, _a) \ - CHT_FIND(_cfg->ih, _cfg->ihsize, I6_, _res, _a) -#define I6HASH_INSERT(_cfg, _i) \ - CHT_INSERT_HEAD(_cfg->ih, _cfg->ihsize, I6_, _i) -#define I6HASH_REMOVE(_cfg, _res, _tmp, _a) \ - CHT_REMOVE(_cfg->ih, _cfg->ihsize, I6_, _res, _tmp, _a) +static void +nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, + struct nat64lsn_state *state) +{ -#define I6HASH_FOREACH_SAFE(_cfg, _x, _tmp, _cb, _arg) \ - CHT_FOREACH_SAFE(_cfg->ih, _cfg->ihsize, I6_, _x, _tmp, _cb, _arg) + memset(plog, 0, sizeof(*plog)); + plog->length = PFLOG_REAL_HDRLEN; + plog->af = family; + plog->action = PF_NAT; + plog->dir = PF_IN; + plog->rulenr = htonl(state->ip_src); + plog->subrulenr = 
htonl((uint32_t)(state->aport << 16) | + (state->proto << 8) | (state->ip_dst & 0xff)); + plog->ruleset[0] = '\0'; + strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); + ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); +} -#define HASH_IN4(x) djb_hash((const unsigned char *)(x), 8) +#define HVAL(p, n, s) jenkins_hash32((const uint32_t *)(p), (n), (s)) +#define HOST_HVAL(c, a) HVAL((a),\ + sizeof(struct in6_addr) / sizeof(uint32_t), (c)->hash_seed) +#define HOSTS(c, v) ((c)->hosts_hash[(v) & ((c)->hosts_hashsize - 1)]) -static unsigned -djb_hash(const unsigned char *h, const int len) +#define ALIASLINK_HVAL(c, f) HVAL(&(f)->dst_ip6,\ + sizeof(struct in6_addr) * 2 / sizeof(uint32_t), (c)->hash_seed) +#define ALIAS_BYHASH(c, v) \ + ((c)->aliases[(v) & ((1 << (32 - (c)->plen4)) - 1)]) +static struct nat64lsn_aliaslink* +nat64lsn_get_aliaslink(struct nat64lsn_cfg *cfg __unused, + struct nat64lsn_host *host, const struct ipfw_flow_id *f_id __unused) { - unsigned int result = 0; - int i; - for (i = 0; i < len; i++) - result = 33 * result ^ h[i]; - - return (result); + /* + * We can implement some different algorithms how + * select an alias address. + * XXX: for now we use first available. + */ + return (CK_SLIST_FIRST(&host->aliases)); } -/* -static size_t -bitmask_size(size_t num, int *level) +#define STATE_HVAL(c, d) HVAL((d), 2, (c)->hash_seed) +#define STATE_HASH(h, v) \ + ((h)->states_hash[(v) & ((h)->states_hashsize - 1)]) +#define STATES_CHUNK(p, v) \ + ((p)->chunks_count == 1 ? 
(p)->states : \ + ((p)->states_chunk[CHUNK_BY_FADDR(p, v)])) + +#ifdef __LP64__ +#define FREEMASK_FFSLL(pg, faddr) \ + ffsll(*FREEMASK_CHUNK((pg), (faddr))) +#define FREEMASK_BTR(pg, faddr, bit) \ + ck_pr_btr_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_BTS(pg, faddr, bit) \ + ck_pr_bts_64(FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_ISSET(pg, faddr, bit) \ + ISSET64(*FREEMASK_CHUNK((pg), (faddr)), (bit)) +#define FREEMASK_COPY(pg, n, out) \ + (out) = ck_pr_load_64(FREEMASK_CHUNK((pg), (n))) +#else +static inline int +freemask_ffsll(uint32_t *freemask) { - size_t x; - int c; + int i; - for (c = 0, x = num; num > 1; num /= 64, c++) - ; - - return (x); + if ((i = ffsl(freemask[0])) != 0) + return (i); + if ((i = ffsl(freemask[1])) != 0) + return (i + 32); + return (0); } +#define FREEMASK_FFSLL(pg, faddr) \ + freemask_ffsll(FREEMASK_CHUNK((pg), (faddr))) +#define FREEMASK_BTR(pg, faddr, bit) \ + ck_pr_btr_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) +#define FREEMASK_BTS(pg, faddr, bit) \ + ck_pr_bts_32(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32, (bit) % 32) +#define FREEMASK_ISSET(pg, faddr, bit) \ + ISSET32(*(FREEMASK_CHUNK((pg), (faddr)) + (bit) / 32), (bit) % 32) +#define FREEMASK_COPY(pg, n, out) \ + (out) = ck_pr_load_32(FREEMASK_CHUNK((pg), (n))) | \ + ((uint64_t)ck_pr_load_32(FREEMASK_CHUNK((pg), (n)) + 1) << 32) +#endif /* !__LP64__ */ -static void -bitmask_prepare(uint64_t *pmask, size_t bufsize, int level) + +#define NAT64LSN_TRY_PGCNT 32 +static struct nat64lsn_pg* +nat64lsn_get_pg(uint32_t *chunkmask, uint32_t *pgmask, + struct nat64lsn_pgchunk **chunks, struct nat64lsn_pg **pgptr, + uint32_t *pgidx, in_addr_t faddr) { - size_t x, z; + struct nat64lsn_pg *pg, *oldpg; + uint32_t idx, oldidx; + int cnt; - memset(pmask, 0xFF, bufsize); - for (x = 0, z = 1; level > 1; x += z, z *= 64, level--) - ; - pmask[x] ~= 0x01; + cnt = 0; + /* First try last used PG */ + oldpg = pg = ck_pr_load_ptr(pgptr); + idx = oldidx = 
ck_pr_load_32(pgidx); + /* If pgidx is out of range, reset it to the first pgchunk */ + if (!ISSET32(*chunkmask, idx / 32)) + idx = 0; + do { + ck_pr_fence_load(); + if (pg != NULL && FREEMASK_BITCOUNT(pg, faddr) > 0) { + /* + * If last used PG has not free states, + * try to update pointer. + * NOTE: it can be already updated by jobs handler, + * thus we use CAS operation. + */ + if (cnt > 0) + ck_pr_cas_ptr(pgptr, oldpg, pg); + return (pg); + } + /* Stop if idx is out of range */ + if (!ISSET32(*chunkmask, idx / 32)) + break; + + if (ISSET32(pgmask[idx / 32], idx % 32)) + pg = ck_pr_load_ptr( + &chunks[idx / 32]->pgptr[idx % 32]); + else + pg = NULL; + + idx++; + } while (++cnt < NAT64LSN_TRY_PGCNT); + + /* If pgidx is out of range, reset it to the first pgchunk */ + if (!ISSET32(*chunkmask, idx / 32)) + idx = 0; + ck_pr_cas_32(pgidx, oldidx, idx); + return (NULL); } -*/ -static void -nat64lsn_log(struct pfloghdr *plog, struct mbuf *m, sa_family_t family, - uint32_t n, uint32_t sn) +static struct nat64lsn_state* +nat64lsn_get_state6to4(struct nat64lsn_cfg *cfg, struct nat64lsn_host *host, + const struct ipfw_flow_id *f_id, uint32_t hval, in_addr_t faddr, + uint16_t port, uint8_t proto) { + struct nat64lsn_aliaslink *link; + struct nat64lsn_state *state; + struct nat64lsn_pg *pg; + int i, offset; - memset(plog, 0, sizeof(*plog)); - plog->length = PFLOG_REAL_HDRLEN; - plog->af = family; - plog->action = PF_NAT; - plog->dir = PF_IN; - plog->rulenr = htonl(n); - plog->subrulenr = htonl(sn); - plog->ruleset[0] = '\0'; - strlcpy(plog->ifname, "NAT64LSN", sizeof(plog->ifname)); - ipfw_bpf_mtap2(plog, PFLOG_HDRLEN, m); + NAT64LSN_EPOCH_ASSERT(); + + /* Check that we already have state for given arguments */ + CK_SLIST_FOREACH(state, &STATE_HASH(host, hval), entries) { + if (state->proto == proto && state->ip_dst == faddr && + state->sport == port && state->dport == f_id->dst_port) + return (state); + } + + link = nat64lsn_get_aliaslink(cfg, host, f_id); + if (link == 
NULL) + return (NULL); + + switch (proto) { + case IPPROTO_TCP: + pg = nat64lsn_get_pg( + &link->alias->tcp_chunkmask, link->alias->tcp_pgmask, + link->alias->tcp, &link->alias->tcp_pg, + &link->alias->tcp_pgidx, faddr); + break; + case IPPROTO_UDP: + pg = nat64lsn_get_pg( + &link->alias->udp_chunkmask, link->alias->udp_pgmask, + link->alias->udp, &link->alias->udp_pg, + &link->alias->udp_pgidx, faddr); + break; + case IPPROTO_ICMP: + pg = nat64lsn_get_pg( + &link->alias->icmp_chunkmask, link->alias->icmp_pgmask, + link->alias->icmp, &link->alias->icmp_pg, + &link->alias->icmp_pgidx, faddr); + break; + default: + panic("%s: wrong proto %d", __func__, proto); + } + if (pg == NULL) + return (NULL); + + /* Check that PG has some free states */ + state = NULL; + i = FREEMASK_BITCOUNT(pg, faddr); + while (i-- > 0) { + offset = FREEMASK_FFSLL(pg, faddr); + if (offset == 0) { + /* + * We lost the race. + * No more free states in this PG. + */ + break; + } + + /* Lets try to atomically grab the state */ + if (FREEMASK_BTR(pg, faddr, offset - 1)) { + state = &STATES_CHUNK(pg, faddr)->state[offset - 1]; + /* Initialize */ + state->flags = proto != IPPROTO_TCP ? 0 : + convert_tcp_flags(f_id->_flags); + state->proto = proto; + state->aport = pg->base_port + offset - 1; + state->dport = f_id->dst_port; + state->sport = port; + state->ip6_dst = f_id->dst_ip6; + state->ip_dst = faddr; + state->ip_src = link->alias->addr; + state->hval = hval; + state->host = host; + SET_AGE(state->timestamp); + + /* Insert new state into host's hash table */ + HOST_LOCK(host); + CK_SLIST_INSERT_HEAD(&STATE_HASH(host, hval), + state, entries); + host->states_count++; + /* + * XXX: In case if host is going to be expired, + * reset NAT64LSN_DEADHOST flag. 
+ */ + host->flags &= ~NAT64LSN_DEADHOST; + HOST_UNLOCK(host); + NAT64STAT_INC(&cfg->base.stats, screated); + /* Mark the state as ready for translate4 */ + ck_pr_fence_store(); + ck_pr_bts_32(&state->flags, NAT64_BIT_READY_IPV4); + break; + } + } + return (state); } + /* * Inspects icmp packets to see if the message contains different * packet header so we need to alter @addr and @port. */ static int -inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, uint32_t *addr, +inspect_icmp_mbuf(struct mbuf **mp, uint8_t *proto, uint32_t *addr, uint16_t *port) { + struct icmp *icmp; struct ip *ip; - struct tcphdr *tcp; - struct udphdr *udp; - struct icmphdr *icmp; int off; - uint8_t proto; + uint8_t inner_proto; - ip = mtod(*m, struct ip *); /* Outer IP header */ + ip = mtod(*mp, struct ip *); /* Outer IP header */ off = (ip->ip_hl << 2) + ICMP_MINLEN; - if ((*m)->m_len < off) - *m = m_pullup(*m, off); - if (*m == NULL) + if ((*mp)->m_len < off) + *mp = m_pullup(*mp, off); + if (*mp == NULL) return (ENOMEM); - ip = mtod(*m, struct ip *); /* Outer IP header */ - icmp = L3HDR(ip, struct icmphdr *); + ip = mtod(*mp, struct ip *); /* Outer IP header */ + icmp = L3HDR(ip, struct icmp *); switch (icmp->icmp_type) { case ICMP_ECHO: case ICMP_ECHOREPLY: /* Use icmp ID as distinguisher */ - *port = ntohs(*((uint16_t *)(icmp + 1))); + *port = ntohs(icmp->icmp_id); return (0); case ICMP_UNREACH: case ICMP_TIMXCEED: @@ -266,90 +448,133 @@ inspect_icmp_mbuf(struct mbuf **m, uint8_t *nat_proto, * ICMP_UNREACH and ICMP_TIMXCEED contains IP header + 64 bits * of ULP header. 
*/ - if ((*m)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) + if ((*mp)->m_pkthdr.len < off + sizeof(struct ip) + ICMP_MINLEN) return (EINVAL); - if ((*m)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) - *m = m_pullup(*m, off + sizeof(struct ip) + ICMP_MINLEN); - if (*m == NULL) + if ((*mp)->m_len < off + sizeof(struct ip) + ICMP_MINLEN) + *mp = m_pullup(*mp, off + sizeof(struct ip) + ICMP_MINLEN); + if (*mp == NULL) return (ENOMEM); - ip = mtodo(*m, off); /* Inner IP header */ - proto = ip->ip_p; + ip = mtodo(*mp, off); /* Inner IP header */ + inner_proto = ip->ip_p; off += ip->ip_hl << 2; /* Skip inner IP header */ *addr = ntohl(ip->ip_src.s_addr); - if ((*m)->m_len < off + ICMP_MINLEN) - *m = m_pullup(*m, off + ICMP_MINLEN); - if (*m == NULL) + if ((*mp)->m_len < off + ICMP_MINLEN) + *mp = m_pullup(*mp, off + ICMP_MINLEN); + if (*mp == NULL) return (ENOMEM); - switch (proto) { + switch (inner_proto) { case IPPROTO_TCP: - tcp = mtodo(*m, off); - *nat_proto = NAT_PROTO_TCP; - *port = ntohs(tcp->th_sport); - return (0); case IPPROTO_UDP: - udp = mtodo(*m, off); - *nat_proto = NAT_PROTO_UDP; - *port = ntohs(udp->uh_sport); + /* Copy source port from the header */ + *port = ntohs(*((uint16_t *)mtodo(*mp, off))); + *proto = inner_proto; return (0); case IPPROTO_ICMP: /* * We will translate only ICMP errors for our ICMP * echo requests. 
*/ - icmp = mtodo(*m, off); + icmp = mtodo(*mp, off); if (icmp->icmp_type != ICMP_ECHO) return (EOPNOTSUPP); - *port = ntohs(*((uint16_t *)(icmp + 1))); + *port = ntohs(icmp->icmp_id); return (0); }; return (EOPNOTSUPP); } -static inline uint8_t -convert_tcp_flags(uint8_t flags) +static struct nat64lsn_state* +nat64lsn_get_state4to6(struct nat64lsn_cfg *cfg, struct nat64lsn_alias *alias, + in_addr_t faddr, uint16_t port, uint8_t proto) { - uint8_t result; + struct nat64lsn_state *state; + struct nat64lsn_pg *pg; + int chunk_idx, pg_idx, state_idx; - result = flags & (TH_FIN|TH_SYN); - result |= (flags & TH_RST) >> 2; /* Treat RST as FIN */ - result |= (flags & TH_ACK) >> 2; /* Treat ACK as estab */ *** DIFF OUTPUT TRUNCATED AT 1000 LINES *** _______________________________________________ svn-src-all@freebsd.org mailing list https://lists.freebsd.org/mailman/listinfo/svn-src-all To unsubscribe, send any mail to "svn-src-all-unsubscr...@freebsd.org"