Module Name: src
Committed By: rmind
Date: Tue Jul 13 22:16:10 UTC 2010
Modified Files:
src/sys/netinet: files.netinet in_var.h ip_input.c ip_var.h
src/sys/rump/net/lib/libnetinet: Makefile.inc
Added Files:
src/sys/netinet: ip_reass.c
Log Message:
Split-off IPv4 re-assembly mechanism into a separate module. Abstract
into ip_reass_init(), ip_reass_lookup(), etc (note: abstraction is not
yet complete). No functional changes to the actual mechanism.
OK matt@
To generate a diff of this commit:
cvs rdiff -u -r1.20 -r1.21 src/sys/netinet/files.netinet
cvs rdiff -u -r1.62 -r1.63 src/sys/netinet/in_var.h
cvs rdiff -u -r1.287 -r1.288 src/sys/netinet/ip_input.c
cvs rdiff -u -r0 -r1.1 src/sys/netinet/ip_reass.c
cvs rdiff -u -r1.91 -r1.92 src/sys/netinet/ip_var.h
cvs rdiff -u -r1.5 -r1.6 src/sys/rump/net/lib/libnetinet/Makefile.inc
Please note that diffs are not public domain; they are subject to the
copyright notices on the relevant files.
Modified files:
Index: src/sys/netinet/files.netinet
diff -u src/sys/netinet/files.netinet:1.20 src/sys/netinet/files.netinet:1.21
--- src/sys/netinet/files.netinet:1.20 Fri Jan 25 21:12:14 2008
+++ src/sys/netinet/files.netinet Tue Jul 13 22:16:10 2010
@@ -1,4 +1,4 @@
-# $NetBSD: files.netinet,v 1.20 2008/01/25 21:12:14 joerg Exp $
+# $NetBSD: files.netinet,v 1.21 2010/07/13 22:16:10 rmind Exp $
defflag opt_tcp_debug.h TCP_DEBUG
defparam opt_tcp_debug.h TCP_NDEBUG
@@ -29,6 +29,7 @@
file netinet/ip_input.c inet
file netinet/ip_mroute.c inet & mrouting
file netinet/ip_output.c inet
+file netinet/ip_reass.c inet
file netinet/raw_ip.c inet
file netinet/tcp_debug.c (inet | inet6) & tcp_debug
Index: src/sys/netinet/in_var.h
diff -u src/sys/netinet/in_var.h:1.62 src/sys/netinet/in_var.h:1.63
--- src/sys/netinet/in_var.h:1.62 Mon Apr 28 20:24:09 2008
+++ src/sys/netinet/in_var.h Tue Jul 13 22:16:10 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: in_var.h,v 1.62 2008/04/28 20:24:09 martin Exp $ */
+/* $NetBSD: in_var.h,v 1.63 2010/07/13 22:16:10 rmind Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@@ -300,6 +300,7 @@
struct lwp *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
+void ip_reass_init(void);
void ip_input(struct mbuf *);
int ipflow_fastforward(struct mbuf *);
void ip_initid(void);
Index: src/sys/netinet/ip_input.c
diff -u src/sys/netinet/ip_input.c:1.287 src/sys/netinet/ip_input.c:1.288
--- src/sys/netinet/ip_input.c:1.287 Fri Jul 9 18:42:46 2010
+++ src/sys/netinet/ip_input.c Tue Jul 13 22:16:10 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $ */
+/* $NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -91,7 +91,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $");
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
@@ -104,7 +104,6 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
@@ -240,105 +239,7 @@
struct pfil_head inet_pfil_hook;
#endif
-/*
- * Cached copy of nmbclusters. If nbclusters is different,
- * recalculate IP parameters derived from nmbclusters.
- */
-static int ip_nmbclusters; /* copy of nmbclusters */
-static void ip_nmbclusters_changed(void); /* recalc limits */
-
-#define CHECK_NMBCLUSTER_PARAMS() \
-do { \
- if (__predict_false(ip_nmbclusters != nmbclusters)) \
- ip_nmbclusters_changed(); \
-} while (/*CONSTCOND*/0)
-
-/* IP datagram reassembly queues (hashed) */
-#define IPREASS_NHASH_LOG2 6
-#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
-#define IPREASS_HMASK (IPREASS_NHASH - 1)
-#define IPREASS_HASH(x,y) \
- (((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
-struct ipqhead ipq[IPREASS_NHASH];
-int ipq_locked;
-static int ip_nfragpackets; /* packets in reass queue */
-static int ip_nfrags; /* total fragments in reass queues */
-
-int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */
-int ip_maxfrags; /* limit on fragments. XXX sysctl */
-
-
-/*
- * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
- * IP reassembly queue buffer managment.
- *
- * We keep a count of total IP fragments (NB: not fragmented packets!)
- * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
- * If ip_nfrags exceeds ip_maxfrags the limit, we drop half the
- * total fragments in reassembly queues.This AIMD policy avoids
- * repeatedly deleting single packets under heavy fragmentation load
- * (e.g., from lossy NFS peers).
- */
-static u_int ip_reass_ttl_decr(u_int ticks);
-static void ip_reass_drophalf(void);
-
-
-static inline int ipq_lock_try(void);
-static inline void ipq_unlock(void);
-
-static inline int
-ipq_lock_try(void)
-{
- int s;
-
- /*
- * Use splvm() -- we're blocking things that would cause
- * mbuf allocation.
- */
- s = splvm();
- if (ipq_locked) {
- splx(s);
- return (0);
- }
- ipq_locked = 1;
- splx(s);
- return (1);
-}
-
-static inline void
-ipq_unlock(void)
-{
- int s;
-
- s = splvm();
- ipq_locked = 0;
- splx(s);
-}
-
-#ifdef DIAGNOSTIC
-#define IPQ_LOCK() \
-do { \
- if (ipq_lock_try() == 0) { \
- printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
- panic("ipq_lock"); \
- } \
-} while (/*CONSTCOND*/ 0)
-#define IPQ_LOCK_CHECK() \
-do { \
- if (ipq_locked == 0) { \
- printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
- panic("ipq lock check"); \
- } \
-} while (/*CONSTCOND*/ 0)
-#else
-#define IPQ_LOCK() (void) ipq_lock_try()
-#define IPQ_LOCK_CHECK() /* nothing */
-#endif
-
-#define IPQ_UNLOCK() ipq_unlock()
-
struct pool inmulti_pool;
-struct pool ipqent_pool;
#ifdef INET_CSUM_COUNTERS
#include <sys/device.h>
@@ -387,16 +288,6 @@
static void sysctl_net_inet_ip_setup(struct sysctllog **);
/*
- * Compute IP limits derived from the value of nmbclusters.
- */
-static void
-ip_nmbclusters_changed(void)
-{
- ip_maxfrags = nmbclusters / 4;
- ip_nmbclusters = nmbclusters;
-}
-
-/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
*/
@@ -410,8 +301,6 @@
pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
NULL, IPL_SOFTNET);
- pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
- NULL, IPL_VM);
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
@@ -424,14 +313,12 @@
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip_protox[pr->pr_protocol] = pr - inetsw;
- for (i = 0; i < IPREASS_NHASH; i++)
- LIST_INIT(&ipq[i]);
+ ip_reass_init();
ip_initid();
ip_id = time_second & 0xfffff;
ipintrq.ifq_maxlen = ipqmaxlen;
- ip_nmbclusters_changed();
TAILQ_INIT(&in_ifaddrhead);
in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
@@ -515,16 +402,12 @@
ip_input(struct mbuf *m)
{
struct ip *ip = NULL;
- struct ipq *fp;
struct in_ifaddr *ia;
struct ifaddr *ifa;
- struct ipqent *ipqe;
- int hlen = 0, mff, len;
+ int hlen = 0, len;
int downmatch;
int checkif;
int srcrt = 0;
- int s;
- u_int hash;
#ifdef FAST_IPSEC
struct m_tag *mtag;
struct tdb_ident *tdbi;
@@ -924,13 +807,12 @@
ours:
/*
* If offset or IP_MF are set, must reassemble.
- * Otherwise, nothing need be done.
- * (We could look in the reassembly queue to see
- * if the packet was previously fragmented,
- * but it's not worth the time; just let them time out.)
*/
if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
- u_int off;
+ struct ipq *fp;
+ u_int off, hash;
+ bool mff;
+
/*
* Prevent TCP blind data attacks by not allowing non-initial
* fragments to start at less than 68 bytes (minimal fragment
@@ -944,16 +826,16 @@
}
/*
- * Adjust ip_len to not reflect header,
- * set ipqe_mff if more fragments are expected,
- * convert offset of this to bytes.
+ * Adjust total IP length to not reflect header. Set 'mff'
+ * indicator if more fragments are expected. Convert the offset
+ * of this fragment to bytes.
*/
ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
mff = (ip->ip_off & htons(IP_MF)) != 0;
if (mff) {
/*
* Make sure that fragments have a data length
- * that's a non-zero multiple of 8 bytes.
+ * which is non-zero and a multiple of 8 bytes.
*/
if (ntohs(ip->ip_len) == 0 ||
(ntohs(ip->ip_len) & 0x7) != 0) {
@@ -963,29 +845,14 @@
}
ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
- /*
- * Look for queue of fragments of this datagram.
- */
- IPQ_LOCK();
- hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
- LIST_FOREACH(fp, &ipq[hash], ipq_q) {
- if (ip->ip_id != fp->ipq_id)
- continue;
- if (!in_hosteq(ip->ip_src, fp->ipq_src))
- continue;
- if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
- continue;
- if (ip->ip_p != fp->ipq_p)
- continue;
- /*
- * Make sure the TOS is matches previous fragments.
- */
- if (ip->ip_tos != fp->ipq_tos) {
- IP_STATINC(IP_STAT_BADFRAGS);
- IPQ_UNLOCK();
- goto bad;
- }
- break;
+ /* Look for queue of fragments of this datagram. */
+ fp = ip_reass_lookup(ip, &hash);
+
+ /* Make sure the TOS matches previous fragments. */
+ if (fp && fp->ipq_tos != ip->ip_tos) {
+ IP_STATINC(IP_STAT_BADFRAGS);
+ ip_reass_unlock();
+ goto bad;
}
/*
@@ -994,21 +861,19 @@
* attempt reassembly; if it succeeds, proceed.
*/
if (mff || ip->ip_off != htons(0)) {
- IP_STATINC(IP_STAT_FRAGMENTS);
- s = splvm();
- ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
- splx(s);
+ struct ipqent *ipqe;
+
+ ipqe = ip_reass_getent();
if (ipqe == NULL) {
IP_STATINC(IP_STAT_RCVMEMDROP);
- IPQ_UNLOCK();
+ ip_reass_unlock();
goto bad;
}
ipqe->ipqe_mff = mff;
ipqe->ipqe_m = m;
ipqe->ipqe_ip = ip;
- m = ip_reass(ipqe, fp, &ipq[hash]);
+ m = ip_reass(ipqe, fp, hash);
if (m == NULL) {
- IPQ_UNLOCK();
return;
}
IP_STATINC(IP_STAT_REASSEMBLED);
@@ -1017,8 +882,8 @@
ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
} else if (fp) {
ip_freef(fp);
+ ip_reass_unlock();
}
- IPQ_UNLOCK();
}
#if defined(IPSEC)
@@ -1096,398 +961,30 @@
}
/*
- * Take incoming datagram fragment and try to
- * reassemble it into whole datagram. If a chain for
- * reassembly of this datagram already exists, then it
- * is given as fp; otherwise have to make a chain.
- */
-struct mbuf *
-ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead)
-{
- struct mbuf *m = ipqe->ipqe_m;
- struct ipqent *nq, *p, *q;
- struct ip *ip;
- struct mbuf *t;
- int hlen = ipqe->ipqe_ip->ip_hl << 2;
- int i, next, s;
-
- IPQ_LOCK_CHECK();
-
- /*
- * Presence of header sizes in mbufs
- * would confuse code below.
- */
- m->m_data += hlen;
- m->m_len -= hlen;
-
-#ifdef notyet
- /* make sure fragment limit is up-to-date */
- CHECK_NMBCLUSTER_PARAMS();
-
- /* If we have too many fragments, drop the older half. */
- if (ip_nfrags >= ip_maxfrags)
- ip_reass_drophalf(void);
-#endif
-
- /*
- * We are about to add a fragment; increment frag count.
- */
- ip_nfrags++;
-
- /*
- * If first fragment to arrive, create a reassembly queue.
- */
- if (fp == 0) {
- /*
- * Enforce upper bound on number of fragmented packets
- * for which we attempt reassembly;
- * If maxfrag is 0, never accept fragments.
- * If maxfrag is -1, accept all fragments without limitation.
- */
- if (ip_maxfragpackets < 0)
- ;
- else if (ip_nfragpackets >= ip_maxfragpackets)
- goto dropfrag;
- ip_nfragpackets++;
- fp = malloc(sizeof (struct ipq), M_FTABLE, M_NOWAIT);
- if (fp == NULL)
- goto dropfrag;
- LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
- fp->ipq_nfrags = 1;
- fp->ipq_ttl = IPFRAGTTL;
- fp->ipq_p = ipqe->ipqe_ip->ip_p;
- fp->ipq_id = ipqe->ipqe_ip->ip_id;
- fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
- TAILQ_INIT(&fp->ipq_fragq);
- fp->ipq_src = ipqe->ipqe_ip->ip_src;
- fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
- p = NULL;
- goto insert;
- } else {
- fp->ipq_nfrags++;
- }
-
- /*
- * Find a segment which begins after this one does.
- */
- for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
- p = q, q = TAILQ_NEXT(q, ipqe_q))
- if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
- break;
-
- /*
- * If there is a preceding segment, it may provide some of
- * our data already. If so, drop the data from the incoming
- * segment. If it provides all of our data, drop us.
- */
- if (p != NULL) {
- i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
- ntohs(ipqe->ipqe_ip->ip_off);
- if (i > 0) {
- if (i >= ntohs(ipqe->ipqe_ip->ip_len))
- goto dropfrag;
- m_adj(ipqe->ipqe_m, i);
- ipqe->ipqe_ip->ip_off =
- htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
- ipqe->ipqe_ip->ip_len =
- htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
- }
- }
-
- /*
- * While we overlap succeeding segments trim them or,
- * if they are completely covered, dequeue them.
- */
- for (; q != NULL &&
- ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
- ntohs(q->ipqe_ip->ip_off); q = nq) {
- i = (ntohs(ipqe->ipqe_ip->ip_off) +
- ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
- if (i < ntohs(q->ipqe_ip->ip_len)) {
- q->ipqe_ip->ip_len =
- htons(ntohs(q->ipqe_ip->ip_len) - i);
- q->ipqe_ip->ip_off =
- htons(ntohs(q->ipqe_ip->ip_off) + i);
- m_adj(q->ipqe_m, i);
- break;
- }
- nq = TAILQ_NEXT(q, ipqe_q);
- m_freem(q->ipqe_m);
- TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
- s = splvm();
- pool_put(&ipqent_pool, q);
- splx(s);
- fp->ipq_nfrags--;
- ip_nfrags--;
- }
-
-insert:
- /*
- * Stick new segment in its place;
- * check for complete reassembly.
- */
- if (p == NULL) {
- TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
- } else {
- TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
- }
- next = 0;
- for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
- p = q, q = TAILQ_NEXT(q, ipqe_q)) {
- if (ntohs(q->ipqe_ip->ip_off) != next)
- return (0);
- next += ntohs(q->ipqe_ip->ip_len);
- }
- if (p->ipqe_mff)
- return (0);
-
- /*
- * Reassembly is complete. Check for a bogus message size and
- * concatenate fragments.
- */
- q = TAILQ_FIRST(&fp->ipq_fragq);
- ip = q->ipqe_ip;
- if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
- IP_STATINC(IP_STAT_TOOLONG);
- ip_freef(fp);
- return (0);
- }
- m = q->ipqe_m;
- t = m->m_next;
- m->m_next = 0;
- m_cat(m, t);
- nq = TAILQ_NEXT(q, ipqe_q);
- s = splvm();
- pool_put(&ipqent_pool, q);
- splx(s);
- for (q = nq; q != NULL; q = nq) {
- t = q->ipqe_m;
- nq = TAILQ_NEXT(q, ipqe_q);
- s = splvm();
- pool_put(&ipqent_pool, q);
- splx(s);
- m_cat(m, t);
- }
- ip_nfrags -= fp->ipq_nfrags;
-
- /*
- * Create header for new ip packet by
- * modifying header of first packet;
- * dequeue and discard fragment reassembly header.
- * Make header visible.
- */
- ip->ip_len = htons(next);
- ip->ip_src = fp->ipq_src;
- ip->ip_dst = fp->ipq_dst;
- LIST_REMOVE(fp, ipq_q);
- free(fp, M_FTABLE);
- ip_nfragpackets--;
- m->m_len += (ip->ip_hl << 2);
- m->m_data -= (ip->ip_hl << 2);
- /* some debugging cruft by sklower, below, will go away soon */
- if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
- int plen = 0;
- for (t = m; t; t = t->m_next)
- plen += t->m_len;
- m->m_pkthdr.len = plen;
- m->m_pkthdr.csum_flags = 0;
- }
- return (m);
-
-dropfrag:
- if (fp != 0)
- fp->ipq_nfrags--;
- ip_nfrags--;
- IP_STATINC(IP_STAT_FRAGDROPPED);
- m_freem(m);
- s = splvm();
- pool_put(&ipqent_pool, ipqe);
- splx(s);
- return (0);
-}
-
-/*
- * Free a fragment reassembly header and all
- * associated datagrams.
- */
-void
-ip_freef(struct ipq *fp)
-{
- struct ipqent *q, *p;
- u_int nfrags = 0;
- int s;
-
- IPQ_LOCK_CHECK();
-
- for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
- p = TAILQ_NEXT(q, ipqe_q);
- m_freem(q->ipqe_m);
- nfrags++;
- TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
- s = splvm();
- pool_put(&ipqent_pool, q);
- splx(s);
- }
-
- if (nfrags != fp->ipq_nfrags)
- printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
- ip_nfrags -= nfrags;
- LIST_REMOVE(fp, ipq_q);
- free(fp, M_FTABLE);
- ip_nfragpackets--;
-}
-
-/*
- * IP reassembly TTL machinery for multiplicative drop.
- */
-static u_int fragttl_histo[(IPFRAGTTL+1)];
-
-
-/*
- * Decrement TTL of all reasembly queue entries by `ticks'.
- * Count number of distinct fragments (as opposed to partial, fragmented
- * datagrams) in the reassembly queue. While we traverse the entire
- * reassembly queue, compute and return the median TTL over all fragments.
- */
-static u_int
-ip_reass_ttl_decr(u_int ticks)
-{
- u_int nfrags, median, dropfraction, keepfraction;
- struct ipq *fp, *nfp;
- int i;
-
- nfrags = 0;
- memset(fragttl_histo, 0, sizeof fragttl_histo);
-
- for (i = 0; i < IPREASS_NHASH; i++) {
- for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
- fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
- 0 : fp->ipq_ttl - ticks);
- nfp = LIST_NEXT(fp, ipq_q);
- if (fp->ipq_ttl == 0) {
- IP_STATINC(IP_STAT_FRAGTIMEOUT);
- ip_freef(fp);
- } else {
- nfrags += fp->ipq_nfrags;
- fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
- }
- }
- }
-
- KASSERT(ip_nfrags == nfrags);
-
- /* Find median (or other drop fraction) in histogram. */
- dropfraction = (ip_nfrags / 2);
- keepfraction = ip_nfrags - dropfraction;
- for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
- median += fragttl_histo[i];
- if (median >= keepfraction)
- break;
- }
-
- /* Return TTL of median (or other fraction). */
- return (u_int)i;
-}
-
-void
-ip_reass_drophalf(void)
-{
-
- u_int median_ticks;
- /*
- * Compute median TTL of all fragments, and count frags
- * with that TTL or lower (roughly half of all fragments).
- */
- median_ticks = ip_reass_ttl_decr(0);
-
- /* Drop half. */
- median_ticks = ip_reass_ttl_decr(median_ticks);
-
-}
-
-/*
- * IP timer processing;
- * if a timer expires on a reassembly
- * queue, discard it.
+ * IP timer processing.
*/
void
ip_slowtimo(void)
{
- static u_int dropscanidx = 0;
- u_int i;
- u_int median_ttl;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
- IPQ_LOCK();
-
- /* Age TTL of all fragments by 1 tick .*/
- median_ttl = ip_reass_ttl_decr(1);
-
- /* make sure fragment limit is up-to-date */
- CHECK_NMBCLUSTER_PARAMS();
-
- /* If we have too many fragments, drop the older half. */
- if (ip_nfrags > ip_maxfrags)
- ip_reass_ttl_decr(median_ttl);
-
- /*
- * If we are over the maximum number of fragmented packets
- * (due to the limit being lowered), drain off
- * enough to get down to the new limit. Start draining
- * from the reassembly hashqueue most recently drained.
- */
- if (ip_maxfragpackets < 0)
- ;
- else {
- int wrapped = 0;
-
- i = dropscanidx;
- while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
- while (LIST_FIRST(&ipq[i]) != NULL)
- ip_freef(LIST_FIRST(&ipq[i]));
- if (++i >= IPREASS_NHASH) {
- i = 0;
- }
- /*
- * Dont scan forever even if fragment counters are
- * wrong: stop after scanning entire reassembly queue.
- */
- if (i == dropscanidx)
- wrapped = 1;
- }
- dropscanidx = i;
- }
- IPQ_UNLOCK();
+ ip_reass_slowtimo();
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
- * Drain off all datagram fragments. Don't acquire softnet_lock as
- * can be called from hardware interrupt context.
+ * IP drain processing.
*/
void
ip_drain(void)
{
KERNEL_LOCK(1, NULL);
-
- /*
- * We may be called from a device's interrupt context. If
- * the ipq is already busy, just bail out now.
- */
- if (ipq_lock_try() != 0) {
- /*
- * Drop half the total fragments now. If more mbufs are
- * needed, we will be called again soon.
- */
- ip_reass_drophalf();
- IPQ_UNLOCK();
- }
-
+ ip_reass_drain();
KERNEL_UNLOCK_ONE(NULL);
}
@@ -2430,14 +1927,6 @@
CTL_NET, PF_INET, IPPROTO_IP,
IPCTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
- sysctl_createv(clog, 0, NULL, NULL,
- CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
- CTLTYPE_INT, "maxfragpackets",
- SYSCTL_DESCR("Maximum number of fragments to retain for "
- "possible reassembly"),
- NULL, 0, &ip_maxfragpackets, 0,
- CTL_NET, PF_INET, IPPROTO_IP,
- IPCTL_MAXFRAGPACKETS, CTL_EOL);
#if NGRE > 0
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
Index: src/sys/netinet/ip_var.h
diff -u src/sys/netinet/ip_var.h:1.91 src/sys/netinet/ip_var.h:1.92
--- src/sys/netinet/ip_var.h:1.91 Sun Feb 1 17:04:11 2009
+++ src/sys/netinet/ip_var.h Tue Jul 13 22:16:10 2010
@@ -1,4 +1,4 @@
-/* $NetBSD: ip_var.h,v 1.91 2009/02/01 17:04:11 pooka Exp $ */
+/* $NetBSD: ip_var.h,v 1.92 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
@@ -198,7 +198,6 @@
extern int ip_hashsize;
#endif
extern struct pool inmulti_pool;
-extern struct pool ipqent_pool;
struct inpcb;
struct sockopt;
@@ -206,7 +205,6 @@
int ip_dooptions(struct mbuf *);
void ip_drain(void);
void ip_forward(struct mbuf *, int);
-void ip_freef(struct ipq *);
void ip_freemoptions(struct ip_moptions *);
int ip_getmoptions(struct ip_moptions *, struct sockopt *);
void ip_init(void);
@@ -215,8 +213,18 @@
int ip_output(struct mbuf *, ...);
int ip_fragment(struct mbuf *, struct ifnet *, u_long);
int ip_pcbopts(struct mbuf **, const struct sockopt *);
+
+struct ipq *
+ ip_reass_lookup(struct ip *, u_int *);
+void ip_reass_unlock(void);
+struct ipqent *
+ ip_reass_getent(void);
struct mbuf *
- ip_reass(struct ipqent *, struct ipq *, struct ipqhead *);
+ ip_reass(struct ipqent *, struct ipq *, u_int);
+void ip_reass_slowtimo(void);
+void ip_reass_drain(void);
+void ip_freef(struct ipq *);
+
struct in_ifaddr *
ip_rtaddr(struct in_addr);
void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,
Index: src/sys/rump/net/lib/libnetinet/Makefile.inc
diff -u src/sys/rump/net/lib/libnetinet/Makefile.inc:1.5 src/sys/rump/net/lib/libnetinet/Makefile.inc:1.6
--- src/sys/rump/net/lib/libnetinet/Makefile.inc:1.5 Tue Feb 16 20:42:47 2010
+++ src/sys/rump/net/lib/libnetinet/Makefile.inc Tue Jul 13 22:16:10 2010
@@ -1,12 +1,12 @@
-# $NetBSD: Makefile.inc,v 1.5 2010/02/16 20:42:47 pooka Exp $
+# $NetBSD: Makefile.inc,v 1.6 2010/07/13 22:16:10 rmind Exp $
#
.PATH: ${.CURDIR}/../../../../netinet ${.CURDIR}/../../../../netinet6
# INET
SRCS+= in_proto.c igmp.c in.c in_offload.c in_pcb.c ip_icmp.c \
- ip_flow.c ip_id.c ip_input.c ip_output.c raw_ip.c in_cksum.c \
- cpu_in_cksum.c in4_cksum.c ip_encap.c
+ ip_flow.c ip_id.c ip_input.c ip_reass.c ip_output.c raw_ip.c \
+ in_cksum.c cpu_in_cksum.c in4_cksum.c ip_encap.c
# INET6
SRCS+= dest6.c frag6.c icmp6.c in6.c in6_cksum.c in6_ifattach.c \
Added files:
Index: src/sys/netinet/ip_reass.c
diff -u /dev/null src/sys/netinet/ip_reass.c:1.1
--- /dev/null Tue Jul 13 22:16:10 2010
+++ src/sys/netinet/ip_reass.c Tue Jul 13 22:16:10 2010
@@ -0,0 +1,677 @@
+/* $NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $ */
+
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ip_input.c 8.2 (Berkeley) 1/4/94
+ */
+
+/*
+ * IP reassembly.
+ *
+ * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP
+ * reassembly queue buffer management.
+ *
+ * We keep a count of total IP fragments (NB: not fragmented packets)
+ * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
+ * If ip_nfrags exceeds the ip_maxfrags limit, we drop half the total
+ * fragments in reassembly queues. This AIMD policy avoids repeatedly
+ * deleting single packets under heavy fragmentation load (e.g., from lossy
+ * NFS peers).
+ */
+
+#include <sys/cdefs.h>
+__KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/domain.h>
+#include <sys/protosw.h>
+#include <sys/pool.h>
+#include <sys/sysctl.h>
+
+#include <net/if.h>
+#include <net/route.h>
+
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/in_pcb.h>
+#include <netinet/in_proto.h>
+#include <netinet/ip_private.h>
+#include <netinet/in_var.h>
+#include <netinet/ip_var.h>
+
+/*
+ * IP datagram reassembly hashed queues, pool, lock and counters.
+ */
+#define IPREASS_HASH_SHIFT 6
+#define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT)
+#define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1)
+#define IPREASS_HASH(x, y) \
+ (((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK)
+
+struct ipqhead ipq[IPREASS_HASH_SIZE];
+struct pool ipqent_pool;
+static int ipq_locked;
+
+static int ip_nfragpackets; /* packets in reass queue */
+static int ip_nfrags; /* total fragments in reass queues */
+
+static int ip_maxfragpackets; /* limit on packets. XXX sysctl */
+static int ip_maxfrags; /* limit on fragments. XXX sysctl */
+
+/*
+ * Cached copy of nmbclusters. If nmbclusters differs from this copy,
+ * recalculate IP parameters derived from nmbclusters.
+ */
+static int ip_nmbclusters; /* copy of nmbclusters */
+
+/*
+ * IP reassembly TTL machinery for multiplicative drop.
+ */
+static u_int fragttl_histo[IPFRAGTTL + 1];
+
+void sysctl_ip_reass_setup(void);
+static void ip_nmbclusters_changed(void);
+static u_int ip_reass_ttl_decr(u_int ticks);
+static void ip_reass_drophalf(void);
+
+/*
+ * ip_reass_init:
+ *
+ * Initialization of IP reassembly mechanism.
+ */
+void
+ip_reass_init(void)
+{
+ int i;
+
+ pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
+ NULL, IPL_VM);
+
+ for (i = 0; i < IPREASS_HASH_SIZE; i++) {
+ LIST_INIT(&ipq[i]);
+ }
+ ip_maxfragpackets = 200;
+ ip_maxfrags = 0;
+ ip_nmbclusters_changed();
+
+ sysctl_ip_reass_setup();
+}
+
+static struct sysctllog *ip_reass_sysctllog;
+
+void
+sysctl_ip_reass_setup(void)
+{
+
+ sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "net", NULL,
+ NULL, 0, NULL, 0,
+ CTL_NET, CTL_EOL);
+ sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "inet",
+ SYSCTL_DESCR("PF_INET related settings"),
+ NULL, 0, NULL, 0,
+ CTL_NET, PF_INET, CTL_EOL);
+ sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT,
+ CTLTYPE_NODE, "ip",
+ SYSCTL_DESCR("IPv4 related settings"),
+ NULL, 0, NULL, 0,
+ CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);
+
+ sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
+ CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
+ CTLTYPE_INT, "maxfragpackets",
+ SYSCTL_DESCR("Maximum number of fragments to retain for "
+ "possible reassembly"),
+ NULL, 0, &ip_maxfragpackets, 0,
+ CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL);
+}
+
+#define CHECK_NMBCLUSTER_PARAMS() \
+do { \
+ if (__predict_false(ip_nmbclusters != nmbclusters)) \
+ ip_nmbclusters_changed(); \
+} while (/*CONSTCOND*/0)
+
+/*
+ * Compute IP limits derived from the value of nmbclusters.
+ */
+static void
+ip_nmbclusters_changed(void)
+{
+ ip_maxfrags = nmbclusters / 4;
+ ip_nmbclusters = nmbclusters;
+}
+
+static inline int ipq_lock_try(void);
+static inline void ipq_unlock(void);
+
+static inline int
+ipq_lock_try(void)
+{
+ int s;
+
+ /*
+ * Use splvm() -- we're blocking things that would cause
+ * mbuf allocation.
+ */
+ s = splvm();
+ if (ipq_locked) {
+ splx(s);
+ return (0);
+ }
+ ipq_locked = 1;
+ splx(s);
+ return (1);
+}
+
+static inline void
+ipq_unlock(void)
+{
+ int s;
+
+ s = splvm();
+ ipq_locked = 0;
+ splx(s);
+}
+
+#ifdef DIAGNOSTIC
+#define IPQ_LOCK() \
+do { \
+ if (ipq_lock_try() == 0) { \
+ printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
+ panic("ipq_lock"); \
+ } \
+} while (/*CONSTCOND*/ 0)
+#define IPQ_LOCK_CHECK() \
+do { \
+ if (ipq_locked == 0) { \
+ printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
+ panic("ipq lock check"); \
+ } \
+} while (/*CONSTCOND*/ 0)
+#else
+#define IPQ_LOCK() (void) ipq_lock_try()
+#define IPQ_LOCK_CHECK() /* nothing */
+#endif
+
+#define IPQ_UNLOCK() ipq_unlock()
+
+/*
+ * ip_reass_lookup:
+ *
+ * Look for queue of fragments of this datagram.
+ */
+struct ipq *
+ip_reass_lookup(struct ip *ip, u_int *hashp)
+{
+ struct ipq *fp;
+ u_int hash;
+
+ IPQ_LOCK();
+ hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
+ LIST_FOREACH(fp, &ipq[hash], ipq_q) {
+ if (ip->ip_id != fp->ipq_id)
+ continue;
+ if (!in_hosteq(ip->ip_src, fp->ipq_src))
+ continue;
+ if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
+ continue;
+ if (ip->ip_p != fp->ipq_p)
+ continue;
+ break;
+ }
+ *hashp = hash;
+ return fp;
+}
+
+void
+ip_reass_unlock(void)
+{
+
+ IPQ_UNLOCK();
+}
+
+struct ipqent *
+ip_reass_getent(void)
+{
+ struct ipqent *ipqe;
+ int s;
+
+ IP_STATINC(IP_STAT_FRAGMENTS);
+ s = splvm();
+ ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
+ splx(s);
+
+ return ipqe;
+}
+
+/*
+ * ip_reass:
+ *
+ * Take incoming datagram fragment and try to reassemble it into whole
+ * datagram. If a chain for reassembly of this datagram already exists,
+ * then it is given as 'fp'; otherwise have to make a chain.
+ */
+struct mbuf *
+ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash)
+{
+ struct ipqhead *ipqhead = &ipq[hash];
+ const int hlen = ipqe->ipqe_ip->ip_hl << 2;
+ struct mbuf *m = ipqe->ipqe_m, *t;
+ struct ipqent *nq, *p, *q;
+ struct ip *ip;
+ int i, next, s;
+
+ IPQ_LOCK_CHECK();
+
+ /*
+ * Presence of header sizes in mbufs would confuse code below.
+ */
+ m->m_data += hlen;
+ m->m_len -= hlen;
+
+#ifdef notyet
+ /* Make sure fragment limit is up-to-date. */
+ CHECK_NMBCLUSTER_PARAMS();
+
+ /* If we have too many fragments, drop the older half. */
+ if (ip_nfrags >= ip_maxfrags) {
+ ip_reass_drophalf(void);
+ }
+#endif
+
+ /*
+ * We are about to add a fragment; increment frag count.
+ */
+ ip_nfrags++;
+
+ /*
+ * If first fragment to arrive, create a reassembly queue.
+ */
+ if (fp == NULL) {
+ /*
+ * Enforce upper bound (ip_maxfragpackets) on the number of
+ * fragmented packets for which we attempt reassembly: a) if
+ * it is 0, never accept fragments; b) if it is negative,
+ * accept all fragments without limitation.
+ */
+ if (ip_maxfragpackets < 0)
+ ;
+ else if (ip_nfragpackets >= ip_maxfragpackets) {
+ goto dropfrag;
+ }
+ ip_nfragpackets++;
+ fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT);
+ if (fp == NULL) {
+ goto dropfrag;
+ }
+ LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
+ fp->ipq_nfrags = 1;
+ fp->ipq_ttl = IPFRAGTTL;
+ fp->ipq_p = ipqe->ipqe_ip->ip_p;
+ fp->ipq_id = ipqe->ipqe_ip->ip_id;
+ fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
+ TAILQ_INIT(&fp->ipq_fragq);
+ fp->ipq_src = ipqe->ipqe_ip->ip_src;
+ fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
+ p = NULL;
+ goto insert;
+ } else {
+ fp->ipq_nfrags++;
+ }
+
+ /*
+ * Find a segment which begins after this one does.
+ */
+ for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
+ p = q, q = TAILQ_NEXT(q, ipqe_q))
+ if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
+ break;
+
+ /*
+ * If there is a preceding segment, it may provide some of our
+ * data already. If so, drop the data from the incoming segment.
+ * If it provides all of our data, drop us.
+ */
+ if (p != NULL) {
+ i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
+ ntohs(ipqe->ipqe_ip->ip_off);
+ if (i > 0) {
+ if (i >= ntohs(ipqe->ipqe_ip->ip_len)) {
+ goto dropfrag;
+ }
+ m_adj(ipqe->ipqe_m, i);
+ ipqe->ipqe_ip->ip_off =
+ htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
+ ipqe->ipqe_ip->ip_len =
+ htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
+ }
+ }
+
+ /*
+ * While we overlap succeeding segments trim them or, if they are
+ * completely covered, dequeue them.
+ */
+ for (; q != NULL &&
+ ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
+ ntohs(q->ipqe_ip->ip_off); q = nq) {
+ i = (ntohs(ipqe->ipqe_ip->ip_off) +
+ ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
+ if (i < ntohs(q->ipqe_ip->ip_len)) {
+ q->ipqe_ip->ip_len =
+ htons(ntohs(q->ipqe_ip->ip_len) - i);
+ q->ipqe_ip->ip_off =
+ htons(ntohs(q->ipqe_ip->ip_off) + i);
+ m_adj(q->ipqe_m, i);
+ break;
+ }
+ nq = TAILQ_NEXT(q, ipqe_q);
+ m_freem(q->ipqe_m);
+ TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
+ s = splvm();
+ pool_put(&ipqent_pool, q);
+ splx(s);
+ fp->ipq_nfrags--;
+ ip_nfrags--;
+ }
+
+insert:
+ /*
+ * Stick new segment in its place; check for complete reassembly.
+ */
+ if (p == NULL) {
+ TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
+ } else {
+ TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
+ }
+ next = 0;
+ for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
+ p = q, q = TAILQ_NEXT(q, ipqe_q)) {
+ if (ntohs(q->ipqe_ip->ip_off) != next) {
+ IPQ_UNLOCK();
+ return NULL;
+ }
+ next += ntohs(q->ipqe_ip->ip_len);
+ }
+ if (p->ipqe_mff) {
+ IPQ_UNLOCK();
+ return NULL;
+ }
+ /*
+ * Reassembly is complete. Check for a bogus message size and
+ * concatenate fragments.
+ */
+ q = TAILQ_FIRST(&fp->ipq_fragq);
+ ip = q->ipqe_ip;
+ if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
+ IP_STATINC(IP_STAT_TOOLONG);
+ ip_freef(fp);
+ IPQ_UNLOCK();
+ return NULL;
+ }
+ m = q->ipqe_m;
+ t = m->m_next;
+ m->m_next = NULL;
+ m_cat(m, t);
+ nq = TAILQ_NEXT(q, ipqe_q);
+ s = splvm();
+ pool_put(&ipqent_pool, q);
+ splx(s);
+ for (q = nq; q != NULL; q = nq) {
+ t = q->ipqe_m;
+ nq = TAILQ_NEXT(q, ipqe_q);
+ s = splvm();
+ pool_put(&ipqent_pool, q);
+ splx(s);
+ m_cat(m, t);
+ }
+ ip_nfrags -= fp->ipq_nfrags;
+
+ /*
+ * Create header for new packet by modifying header of first
+ * packet. Dequeue and discard fragment reassembly header. Make
+ * header visible.
+ */
+ ip->ip_len = htons(next);
+ ip->ip_src = fp->ipq_src;
+ ip->ip_dst = fp->ipq_dst;
+ LIST_REMOVE(fp, ipq_q);
+ free(fp, M_FTABLE);
+ ip_nfragpackets--;
+ m->m_len += (ip->ip_hl << 2);
+ m->m_data -= (ip->ip_hl << 2);
+ /* some debugging cruft by sklower, below, will go away soon */
+ if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
+ int plen = 0;
+ for (t = m; t; t = t->m_next) {
+ plen += t->m_len;
+ }
+ m->m_pkthdr.len = plen;
+ m->m_pkthdr.csum_flags = 0;
+ }
+ IPQ_UNLOCK();
+ return m;
+
+dropfrag:
+ if (fp != NULL) {
+ fp->ipq_nfrags--;
+ }
+ ip_nfrags--;
+ IP_STATINC(IP_STAT_FRAGDROPPED);
+ m_freem(m);
+ s = splvm();
+ pool_put(&ipqent_pool, ipqe);
+ splx(s);
+ IPQ_UNLOCK();
+ return NULL;
+}
+
+/*
+ * ip_freef:
+ *
+ *	Release a reassembly queue header together with every fragment
+ *	still linked on it: each fragment's mbuf chain is freed, its queue
+ *	entry is returned to ipqent_pool, and finally the header itself is
+ *	unlinked from its hash list and freed.  The global ip_nfrags and
+ *	ip_nfragpackets counters are adjusted to match.
+ *
+ *	=> The reassembly lock must be held (see IPQ_LOCK_CHECK below).
+ *	=> fp must not be referenced after this call.
+ */
+void
+ip_freef(struct ipq *fp)
+{
+	struct ipqent *ent;
+	u_int freed = 0;
+	int s;
+
+	IPQ_LOCK_CHECK();
+
+	/*
+	 * Each entry is unlinked as it is released, so simply keep
+	 * consuming the head of the list until nothing is left.
+	 */
+	while ((ent = TAILQ_FIRST(&fp->ipq_fragq)) != NULL) {
+		m_freem(ent->ipqe_m);
+		freed++;
+		TAILQ_REMOVE(&fp->ipq_fragq, ent, ipqe_q);
+		s = splvm();
+		pool_put(&ipqent_pool, ent);
+		splx(s);
+	}
+
+	/*
+	 * The per-queue counter should agree with what was actually on
+	 * the list; report (but tolerate) any mismatch.  Both values are
+	 * unsigned, hence %u.
+	 */
+	if (freed != fp->ipq_nfrags) {
+		printf("ip_freef: nfrags %u != %u\n",
+		    (u_int)fp->ipq_nfrags, freed);
+	}
+
+	/* Account using the number of entries really released. */
+	ip_nfrags -= freed;
+	LIST_REMOVE(fp, ipq_q);
+	free(fp, M_FTABLE);
+	ip_nfragpackets--;
+}
+
+/*
+ * ip_reass_ttl_decr:
+ *
+ *	Age every reassembly queue by `ticks'.  A queue whose TTL would
+ *	drop to zero (TTL <= ticks) is counted as timed out and freed.
+ *	For the queues that survive, the number of fragments they hold is
+ *	accumulated into fragttl_histo[], indexed by the new TTL.
+ *
+ *	The histogram is then walked from the highest TTL downwards until
+ *	at least half (rounded up) of all fragments have been covered; the
+ *	TTL at which that happens - the median fragment TTL - is returned.
+ *
+ *	=> Calls ip_freef(), so the reassembly lock must be held.
+ *	=> Calling with ticks == 0 frees nothing and only computes the
+ *	   median.
+ */
+static u_int
+ip_reass_ttl_decr(u_int ticks)
+{
+	u_int counted = 0, covered = 0, drop, keep;
+	struct ipq *cur, *following;
+	int bucket, ttl;
+
+	memset(fragttl_histo, 0, sizeof(fragttl_histo));
+
+	for (bucket = 0; bucket < IPREASS_HASH_SIZE; bucket++) {
+		cur = LIST_FIRST(&ipq[bucket]);
+		while (cur != NULL) {
+			/* Fetch the successor first: cur may be freed. */
+			following = LIST_NEXT(cur, ipq_q);
+
+			if (cur->ipq_ttl <= ticks) {
+				/* Expired: account for it and discard. */
+				cur->ipq_ttl = 0;
+				IP_STATINC(IP_STAT_FRAGTIMEOUT);
+				ip_freef(cur);
+			} else {
+				/* Still alive: age it and record it. */
+				cur->ipq_ttl -= ticks;
+				counted += cur->ipq_nfrags;
+				fragttl_histo[cur->ipq_ttl] +=
+				    cur->ipq_nfrags;
+			}
+			cur = following;
+		}
+	}
+
+	/* The global fragment count must match what is on the queues. */
+	KASSERT(ip_nfrags == counted);
+
+	/*
+	 * Locate the median (or other drop fraction) in the histogram:
+	 * scan from the youngest fragments towards the oldest until the
+	 * share we intend to keep has been covered.
+	 */
+	drop = ip_nfrags / 2;
+	keep = ip_nfrags - drop;
+	for (ttl = IPFRAGTTL; ttl >= 0; ttl--) {
+		covered += fragttl_histo[ttl];
+		if (covered >= keep)
+			break;
+	}
+
+	/* TTL of the median (or other fraction). */
+	return (u_int)ttl;
+}
+
+/*
+ * ip_reass_drophalf:
+ *
+ *	Discard roughly the older half of all queued fragments.
+ *
+ *	=> Ends up in ip_freef(), so the reassembly lock must be held.
+ *	=> The second pass ages by the median TTL, so queues that survive
+ *	   also have their TTL reduced by that amount.
+ */
+static void
+ip_reass_drophalf(void)
+{
+	u_int median;
+
+	/*
+	 * Pass 1: age by zero ticks.  Nothing is freed; we only want
+	 * the median TTL over all fragments.
+	 */
+	median = ip_reass_ttl_decr(0);
+
+	/*
+	 * Pass 2: age by the median, which expires every queue whose
+	 * TTL is at or below it.  The new median is of no interest.
+	 */
+	(void)ip_reass_ttl_decr(median);
+}
+
+/*
+ * ip_reass_drain:
+ *
+ *	Drain off datagram fragments to release memory.  Does not acquire
+ *	softnet_lock, as it can be called from hardware interrupt context.
+ *
+ *	=> Only tries the reassembly lock; if the attempt fails, nothing
+ *	   is dropped and the function returns immediately.
+ */
+void
+ip_reass_drain(void)
+{
+
+	/*
+	 * We may be running in a device's interrupt context, so we must
+	 * not wait: if the ipq is already busy, give up right away.
+	 */
+	if (ipq_lock_try() == 0) {
+		return;
+	}
+
+	/*
+	 * Drop half of the total fragments now.  Should more mbufs be
+	 * needed, we will be called again soon.
+	 */
+	ip_reass_drophalf();
+	IPQ_UNLOCK();
+}
+
+/*
+ * ip_reass_slowtimo:
+ *
+ *	Periodic ageing of the reassembly queues, performed with the
+ *	reassembly lock held:
+ *
+ *	1. age every queue by one tick, discarding those that expire;
+ *	2. if more fragments are held than ip_maxfrags allows, discard
+ *	   the older half;
+ *	3. if more fragmented packets are held than ip_maxfragpackets
+ *	   allows (a negative limit means "unlimited"), empty whole hash
+ *	   buckets until back under the limit.
+ */
+void
+ip_reass_slowtimo(void)
+{
+	/* Bucket at which the next step-3 scan resumes. */
+	static u_int dropscanidx = 0;
+	u_int bucket, start, median;
+	struct ipq *victim;
+
+	IPQ_LOCK();
+
+	/* Age the TTL of all fragments by 1 tick. */
+	median = ip_reass_ttl_decr(1);
+
+	/* Make sure the fragment limit is up-to-date. */
+	CHECK_NMBCLUSTER_PARAMS();
+
+	/* Too many fragments: drop the older half. */
+	if (ip_nfrags > ip_maxfrags) {
+		ip_reass_ttl_decr(median);
+	}
+
+	/*
+	 * If we are over the maximum number of fragmented packets (due
+	 * to the limit being lowered), drain off enough to get down to
+	 * the new limit.  Scanning resumes at the hash bucket where the
+	 * previous scan stopped.
+	 */
+	if (ip_maxfragpackets >= 0) {
+		start = dropscanidx;
+		bucket = start;
+		while (ip_nfragpackets > ip_maxfragpackets) {
+			/* Empty this bucket completely. */
+			while ((victim = LIST_FIRST(&ipq[bucket])) != NULL) {
+				ip_freef(victim);
+			}
+			if (++bucket >= IPREASS_HASH_SIZE) {
+				bucket = 0;
+			}
+			/*
+			 * Do not scan forever even if the fragment
+			 * counters are wrong: stop once the entire
+			 * reassembly table has been visited.
+			 */
+			if (bucket == start) {
+				break;
+			}
+		}
+		dropscanidx = bucket;
+	}
+
+	IPQ_UNLOCK();
+}