Split off the IPv4 reassembly mechanism into a separate module. Abstract
it into ip_reass_init(), ip_reass_lookup(), etc. (note: the abstraction is
not yet complete).  No functional changes to the actual mechanism.

OK matt@
rmind 2010-07-13 22:16:10 +00:00
parent 29dd668442
commit bcc65ff09f
6 changed files with 726 additions and 550 deletions
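
A rough sketch of the new interface as it is now used from ip_input(), condensed from the ip_input.c hunks below (TOS check, statistics and some error handling omitted; 'm', 'ip', 'mff' and 'hash' are set up exactly as in the diff):

	if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
		struct ipq *fp;
		u_int hash;
		bool mff;

		/* ... length/offset checks, mff and byte-offset conversion ... */

		/* Locks the reassembly queues; returns an existing chain or NULL. */
		fp = ip_reass_lookup(ip, &hash);

		if (mff || ip->ip_off != htons(0)) {
			/* Allocate a fragment entry; may fail under memory pressure. */
			struct ipqent *ipqe = ip_reass_getent();
			if (ipqe == NULL) {
				ip_reass_unlock();
				goto bad;
			}
			ipqe->ipqe_mff = mff;
			ipqe->ipqe_m = m;
			ipqe->ipqe_ip = ip;
			/* Reassemble; drops the lock, returns NULL while incomplete. */
			m = ip_reass(ipqe, fp, hash);
			if (m == NULL)
				return;
		} else if (fp) {
			/* Not reassembling: free any stale queue for this datagram. */
			ip_freef(fp);
			ip_reass_unlock();
		}
	}

The locking discipline is unchanged: ip_reass_lookup() takes the ipq lock, and either ip_reass() or ip_reass_unlock() releases it.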


@ -1,4 +1,4 @@
# $NetBSD: files.netinet,v 1.20 2008/01/25 21:12:14 joerg Exp $
# $NetBSD: files.netinet,v 1.21 2010/07/13 22:16:10 rmind Exp $
defflag opt_tcp_debug.h TCP_DEBUG
defparam opt_tcp_debug.h TCP_NDEBUG
@ -29,6 +29,7 @@ file netinet/ip_id.c inet
file netinet/ip_input.c inet
file netinet/ip_mroute.c inet & mrouting
file netinet/ip_output.c inet
file netinet/ip_reass.c inet
file netinet/raw_ip.c inet
file netinet/tcp_debug.c (inet | inet6) & tcp_debug


@ -1,4 +1,4 @@
/* $NetBSD: in_var.h,v 1.62 2008/04/28 20:24:09 martin Exp $ */
/* $NetBSD: in_var.h,v 1.63 2010/07/13 22:16:10 rmind Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@ -300,6 +300,7 @@ int in_control(struct socket *, u_long, void *, struct ifnet *,
struct lwp *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
void ip_reass_init(void);
void ip_input(struct mbuf *);
int ipflow_fastforward(struct mbuf *);
void ip_initid(void);


@ -1,4 +1,4 @@
/* $NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $ */
/* $NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@ -91,7 +91,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $");
__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $");
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
@ -104,7 +104,6 @@ __KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $")
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
@ -240,105 +239,7 @@ percpu_t *ipstat_percpu;
struct pfil_head inet_pfil_hook;
#endif
/*
* Cached copy of nmbclusters. If nmbclusters is different,
* recalculate IP parameters derived from nmbclusters.
*/
static int ip_nmbclusters; /* copy of nmbclusters */
static void ip_nmbclusters_changed(void); /* recalc limits */
#define CHECK_NMBCLUSTER_PARAMS() \
do { \
if (__predict_false(ip_nmbclusters != nmbclusters)) \
ip_nmbclusters_changed(); \
} while (/*CONSTCOND*/0)
/* IP datagram reassembly queues (hashed) */
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
#define IPREASS_HASH(x,y) \
(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
struct ipqhead ipq[IPREASS_NHASH];
int ipq_locked;
static int ip_nfragpackets; /* packets in reass queue */
static int ip_nfrags; /* total fragments in reass queues */
int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */
int ip_maxfrags; /* limit on fragments. XXX sysctl */
/*
* Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
* IP reassembly queue buffer management.
*
* We keep a count of total IP fragments (NB: not fragmented packets!)
* awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
* If ip_nfrags exceeds the ip_maxfrags limit, we drop half the
* total fragments in reassembly queues. This AIMD policy avoids
* repeatedly deleting single packets under heavy fragmentation load
* (e.g., from lossy NFS peers).
*/
static u_int ip_reass_ttl_decr(u_int ticks);
static void ip_reass_drophalf(void);
static inline int ipq_lock_try(void);
static inline void ipq_unlock(void);
static inline int
ipq_lock_try(void)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (ipq_locked) {
splx(s);
return (0);
}
ipq_locked = 1;
splx(s);
return (1);
}
static inline void
ipq_unlock(void)
{
int s;
s = splvm();
ipq_locked = 0;
splx(s);
}
#ifdef DIAGNOSTIC
#define IPQ_LOCK() \
do { \
if (ipq_lock_try() == 0) { \
printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
panic("ipq_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define IPQ_LOCK_CHECK() \
do { \
if (ipq_locked == 0) { \
printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
panic("ipq lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define IPQ_LOCK() (void) ipq_lock_try()
#define IPQ_LOCK_CHECK() /* nothing */
#endif
#define IPQ_UNLOCK() ipq_unlock()
struct pool inmulti_pool;
struct pool ipqent_pool;
#ifdef INET_CSUM_COUNTERS
#include <sys/device.h>
@ -386,16 +287,6 @@ struct mowner ip_tx_mowner = MOWNER_INIT("internet", "tx");
static void sysctl_net_inet_ip_setup(struct sysctllog **);
/*
* Compute IP limits derived from the value of nmbclusters.
*/
static void
ip_nmbclusters_changed(void)
{
ip_maxfrags = nmbclusters / 4;
ip_nmbclusters = nmbclusters;
}
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
@ -410,8 +301,6 @@ ip_init(void)
pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
NULL, IPL_SOFTNET);
pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
NULL, IPL_VM);
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
@ -424,14 +313,12 @@ ip_init(void)
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip_protox[pr->pr_protocol] = pr - inetsw;
for (i = 0; i < IPREASS_NHASH; i++)
LIST_INIT(&ipq[i]);
ip_reass_init();
ip_initid();
ip_id = time_second & 0xfffff;
ipintrq.ifq_maxlen = ipqmaxlen;
ip_nmbclusters_changed();
TAILQ_INIT(&in_ifaddrhead);
in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
@ -515,16 +402,12 @@ void
ip_input(struct mbuf *m)
{
struct ip *ip = NULL;
struct ipq *fp;
struct in_ifaddr *ia;
struct ifaddr *ifa;
struct ipqent *ipqe;
int hlen = 0, mff, len;
int hlen = 0, len;
int downmatch;
int checkif;
int srcrt = 0;
int s;
u_int hash;
#ifdef FAST_IPSEC
struct m_tag *mtag;
struct tdb_ident *tdbi;
@ -924,13 +807,12 @@ ip_input(struct mbuf *m)
ours:
/*
* If offset or IP_MF are set, must reassemble.
* Otherwise, nothing need be done.
* (We could look in the reassembly queue to see
* if the packet was previously fragmented,
* but it's not worth the time; just let them time out.)
*/
if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
u_int off;
struct ipq *fp;
u_int off, hash;
bool mff;
/*
* Prevent TCP blind data attacks by not allowing non-initial
* fragments to start at less than 68 bytes (minimal fragment
@ -944,16 +826,16 @@ ours:
}
/*
* Adjust ip_len to not reflect header,
* set ipqe_mff if more fragments are expected,
* convert offset of this to bytes.
* Adjust total IP length to not reflect header. Set 'mff'
* indicator, if more fragments are expected. Convert offset
* of this to bytes.
*/
ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
mff = (ip->ip_off & htons(IP_MF)) != 0;
if (mff) {
/*
* Make sure that fragments have a data length
* that's a non-zero multiple of 8 bytes.
* which is non-zero and a multiple of 8 bytes.
*/
if (ntohs(ip->ip_len) == 0 ||
(ntohs(ip->ip_len) & 0x7) != 0) {
@ -963,29 +845,14 @@ ours:
}
ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
/*
* Look for queue of fragments of this datagram.
*/
IPQ_LOCK();
hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
LIST_FOREACH(fp, &ipq[hash], ipq_q) {
if (ip->ip_id != fp->ipq_id)
continue;
if (!in_hosteq(ip->ip_src, fp->ipq_src))
continue;
if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
continue;
if (ip->ip_p != fp->ipq_p)
continue;
/*
* Make sure the TOS matches previous fragments.
*/
if (ip->ip_tos != fp->ipq_tos) {
IP_STATINC(IP_STAT_BADFRAGS);
IPQ_UNLOCK();
goto bad;
}
break;
/* Look for queue of fragments of this datagram. */
fp = ip_reass_lookup(ip, &hash);
/* Make sure the TOS matches previous fragments. */
if (fp && fp->ipq_tos != ip->ip_tos) {
IP_STATINC(IP_STAT_BADFRAGS);
ip_reass_unlock();
goto bad;
}
/*
@ -994,21 +861,19 @@ ours:
* attempt reassembly; if it succeeds, proceed.
*/
if (mff || ip->ip_off != htons(0)) {
IP_STATINC(IP_STAT_FRAGMENTS);
s = splvm();
ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
splx(s);
struct ipqent *ipqe;
ipqe = ip_reass_getent();
if (ipqe == NULL) {
IP_STATINC(IP_STAT_RCVMEMDROP);
IPQ_UNLOCK();
ip_reass_unlock();
goto bad;
}
ipqe->ipqe_mff = mff;
ipqe->ipqe_m = m;
ipqe->ipqe_ip = ip;
m = ip_reass(ipqe, fp, &ipq[hash]);
m = ip_reass(ipqe, fp, hash);
if (m == NULL) {
IPQ_UNLOCK();
return;
}
IP_STATINC(IP_STAT_REASSEMBLED);
@ -1017,8 +882,8 @@ ours:
ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
} else if (fp) {
ip_freef(fp);
ip_reass_unlock();
}
IPQ_UNLOCK();
}
#if defined(IPSEC)
@ -1096,398 +961,30 @@ badcsum:
}
/*
* Take incoming datagram fragment and try to
* reassemble it into whole datagram. If a chain for
* reassembly of this datagram already exists, then it
* is given as fp; otherwise have to make a chain.
*/
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead)
{
struct mbuf *m = ipqe->ipqe_m;
struct ipqent *nq, *p, *q;
struct ip *ip;
struct mbuf *t;
int hlen = ipqe->ipqe_ip->ip_hl << 2;
int i, next, s;
IPQ_LOCK_CHECK();
/*
* Presence of header sizes in mbufs
* would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
#ifdef notyet
/* make sure fragment limit is up-to-date */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags >= ip_maxfrags)
ip_reass_drophalf();
#endif
/*
* We are about to add a fragment; increment frag count.
*/
ip_nfrags++;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == 0) {
/*
* Enforce upper bound on number of fragmented packets
* for which we attempt reassembly;
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
if (ip_maxfragpackets < 0)
;
else if (ip_nfragpackets >= ip_maxfragpackets)
goto dropfrag;
ip_nfragpackets++;
fp = malloc(sizeof (struct ipq), M_FTABLE, M_NOWAIT);
if (fp == NULL)
goto dropfrag;
LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
fp->ipq_nfrags = 1;
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ipqe->ipqe_ip->ip_p;
fp->ipq_id = ipqe->ipqe_ip->ip_id;
fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
TAILQ_INIT(&fp->ipq_fragq);
fp->ipq_src = ipqe->ipqe_ip->ip_src;
fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
p = NULL;
goto insert;
} else {
fp->ipq_nfrags++;
}
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q))
if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
break;
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (p != NULL) {
i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
ntohs(ipqe->ipqe_ip->ip_off);
if (i > 0) {
if (i >= ntohs(ipqe->ipqe_ip->ip_len))
goto dropfrag;
m_adj(ipqe->ipqe_m, i);
ipqe->ipqe_ip->ip_off =
htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
ipqe->ipqe_ip->ip_len =
htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
}
}
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
for (; q != NULL &&
ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
ntohs(q->ipqe_ip->ip_off); q = nq) {
i = (ntohs(ipqe->ipqe_ip->ip_off) +
ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
if (i < ntohs(q->ipqe_ip->ip_len)) {
q->ipqe_ip->ip_len =
htons(ntohs(q->ipqe_ip->ip_len) - i);
q->ipqe_ip->ip_off =
htons(ntohs(q->ipqe_ip->ip_off) + i);
m_adj(q->ipqe_m, i);
break;
}
nq = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
fp->ipq_nfrags--;
ip_nfrags--;
}
insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
*/
if (p == NULL) {
TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
} else {
TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
}
next = 0;
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q)) {
if (ntohs(q->ipqe_ip->ip_off) != next)
return (0);
next += ntohs(q->ipqe_ip->ip_len);
}
if (p->ipqe_mff)
return (0);
/*
* Reassembly is complete. Check for a bogus message size and
* concatenate fragments.
*/
q = TAILQ_FIRST(&fp->ipq_fragq);
ip = q->ipqe_ip;
if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
IP_STATINC(IP_STAT_TOOLONG);
ip_freef(fp);
return (0);
}
m = q->ipqe_m;
t = m->m_next;
m->m_next = 0;
m_cat(m, t);
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
for (q = nq; q != NULL; q = nq) {
t = q->ipqe_m;
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
m_cat(m, t);
}
ip_nfrags -= fp->ipq_nfrags;
/*
* Create header for new ip packet by
* modifying header of first packet;
* dequeue and discard fragment reassembly header.
* Make header visible.
*/
ip->ip_len = htons(next);
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
/* some debugging cruft by sklower, below, will go away soon */
if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
int plen = 0;
for (t = m; t; t = t->m_next)
plen += t->m_len;
m->m_pkthdr.len = plen;
m->m_pkthdr.csum_flags = 0;
}
return (m);
dropfrag:
if (fp != 0)
fp->ipq_nfrags--;
ip_nfrags--;
IP_STATINC(IP_STAT_FRAGDROPPED);
m_freem(m);
s = splvm();
pool_put(&ipqent_pool, ipqe);
splx(s);
return (0);
}
/*
* Free a fragment reassembly header and all
* associated datagrams.
*/
void
ip_freef(struct ipq *fp)
{
struct ipqent *q, *p;
u_int nfrags = 0;
int s;
IPQ_LOCK_CHECK();
for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
p = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
nfrags++;
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
}
if (nfrags != fp->ipq_nfrags)
printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
ip_nfrags -= nfrags;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
}
/*
* IP reassembly TTL machinery for multiplicative drop.
*/
static u_int fragttl_histo[(IPFRAGTTL+1)];
/*
* Decrement TTL of all reassembly queue entries by `ticks'.
* Count number of distinct fragments (as opposed to partial, fragmented
* datagrams) in the reassembly queue. While we traverse the entire
* reassembly queue, compute and return the median TTL over all fragments.
*/
static u_int
ip_reass_ttl_decr(u_int ticks)
{
u_int nfrags, median, dropfraction, keepfraction;
struct ipq *fp, *nfp;
int i;
nfrags = 0;
memset(fragttl_histo, 0, sizeof fragttl_histo);
for (i = 0; i < IPREASS_NHASH; i++) {
for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
0 : fp->ipq_ttl - ticks);
nfp = LIST_NEXT(fp, ipq_q);
if (fp->ipq_ttl == 0) {
IP_STATINC(IP_STAT_FRAGTIMEOUT);
ip_freef(fp);
} else {
nfrags += fp->ipq_nfrags;
fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
}
}
}
KASSERT(ip_nfrags == nfrags);
/* Find median (or other drop fraction) in histogram. */
dropfraction = (ip_nfrags / 2);
keepfraction = ip_nfrags - dropfraction;
for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
median += fragttl_histo[i];
if (median >= keepfraction)
break;
}
/* Return TTL of median (or other fraction). */
return (u_int)i;
}
void
ip_reass_drophalf(void)
{
u_int median_ticks;
/*
* Compute median TTL of all fragments, and count frags
* with that TTL or lower (roughly half of all fragments).
*/
median_ticks = ip_reass_ttl_decr(0);
/* Drop half. */
median_ticks = ip_reass_ttl_decr(median_ticks);
}
/*
* IP timer processing;
* if a timer expires on a reassembly
* queue, discard it.
* IP timer processing.
*/
void
ip_slowtimo(void)
{
static u_int dropscanidx = 0;
u_int i;
u_int median_ttl;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
IPQ_LOCK();
/* Age TTL of all fragments by 1 tick. */
median_ttl = ip_reass_ttl_decr(1);
/* make sure fragment limit is up-to-date */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags > ip_maxfrags)
ip_reass_ttl_decr(median_ttl);
/*
* If we are over the maximum number of fragmented packets
* (due to the limit being lowered), drain off
* enough to get down to the new limit. Start draining
* from the reassembly hashqueue most recently drained.
*/
if (ip_maxfragpackets < 0)
;
else {
int wrapped = 0;
i = dropscanidx;
while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
while (LIST_FIRST(&ipq[i]) != NULL)
ip_freef(LIST_FIRST(&ipq[i]));
if (++i >= IPREASS_NHASH) {
i = 0;
}
/*
* Don't scan forever even if fragment counters are
* wrong: stop after scanning entire reassembly queue.
*/
if (i == dropscanidx)
wrapped = 1;
}
dropscanidx = i;
}
IPQ_UNLOCK();
ip_reass_slowtimo();
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Drain off all datagram fragments. Don't acquire softnet_lock as
* can be called from hardware interrupt context.
* IP drain processing.
*/
void
ip_drain(void)
{
KERNEL_LOCK(1, NULL);
/*
* We may be called from a device's interrupt context. If
* the ipq is already busy, just bail out now.
*/
if (ipq_lock_try() != 0) {
/*
* Drop half the total fragments now. If more mbufs are
* needed, we will be called again soon.
*/
ip_reass_drophalf();
IPQ_UNLOCK();
}
ip_reass_drain();
KERNEL_UNLOCK_ONE(NULL);
}
@ -2430,14 +1927,6 @@ sysctl_net_inet_ip_setup(struct sysctllog **clog)
CTL_NET, PF_INET, IPPROTO_IP,
IPCTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to retain for "
"possible reassembly"),
NULL, 0, &ip_maxfragpackets, 0,
CTL_NET, PF_INET, IPPROTO_IP,
IPCTL_MAXFRAGPACKETS, CTL_EOL);
#if NGRE > 0
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,

sys/netinet/ip_reass.c (new file, 677 lines)

@ -0,0 +1,677 @@
/* $NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
/*
* IP reassembly.
*
* Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP
* reassembly queue buffer management.
*
* We keep a count of total IP fragments (NB: not fragmented packets)
* awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
* If ip_nfrags exceeds the ip_maxfrags limit, we drop half the total
* fragments in reassembly queues. This AIMD policy avoids repeatedly
* deleting single packets under heavy fragmentation load (e.g., from lossy
* NFS peers).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_proto.h>
#include <netinet/ip_private.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
/*
* IP datagram reassembly hashed queues, pool, lock and counters.
*/
#define IPREASS_HASH_SHIFT 6
#define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT)
#define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1)
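/*
 * The hash takes four bits from each of the two low-order bytes of
 * ip_src.s_addr, XORs in the IP id, and masks the result down to one
 * of the IPREASS_HASH_SIZE buckets.
 */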
#define IPREASS_HASH(x, y) \
(((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK)
struct ipqhead ipq[IPREASS_HASH_SIZE];
struct pool ipqent_pool;
static int ipq_locked;
static int ip_nfragpackets; /* packets in reass queue */
static int ip_nfrags; /* total fragments in reass queues */
static int ip_maxfragpackets; /* limit on packets. XXX sysctl */
static int ip_maxfrags; /* limit on fragments. XXX sysctl */
/*
* Cached copy of nmbclusters. If nmbclusters is different,
* recalculate IP parameters derived from nmbclusters.
*/
static int ip_nmbclusters; /* copy of nmbclusters */
/*
* IP reassembly TTL machinery for multiplicative drop.
*/
static u_int fragttl_histo[IPFRAGTTL + 1];
void sysctl_ip_reass_setup(void);
static void ip_nmbclusters_changed(void);
static u_int ip_reass_ttl_decr(u_int ticks);
static void ip_reass_drophalf(void);
/*
* ip_reass_init:
*
* Initialization of IP reassembly mechanism.
*/
void
ip_reass_init(void)
{
int i;
pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
NULL, IPL_VM);
for (i = 0; i < IPREASS_HASH_SIZE; i++) {
LIST_INIT(&ipq[i]);
}
ip_maxfragpackets = 200;
ip_maxfrags = 0;
ip_nmbclusters_changed();
sysctl_ip_reass_setup();
}
static struct sysctllog *ip_reass_sysctllog;
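/*
 * sysctl_ip_reass_setup:
 *
 * Register the net.inet.ip sysctl subtree for reassembly (currently
 * only the maxfragpackets limit).
 */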
void
sysctl_ip_reass_setup(void)
{
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "net", NULL,
NULL, 0, NULL, 0,
CTL_NET, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet",
SYSCTL_DESCR("PF_INET related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ip",
SYSCTL_DESCR("IPv4 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to retain for "
"possible reassembly"),
NULL, 0, &ip_maxfragpackets, 0,
CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL);
}
#define CHECK_NMBCLUSTER_PARAMS() \
do { \
if (__predict_false(ip_nmbclusters != nmbclusters)) \
ip_nmbclusters_changed(); \
} while (/*CONSTCOND*/0)
/*
* Compute IP limits derived from the value of nmbclusters.
*/
static void
ip_nmbclusters_changed(void)
{
ip_maxfrags = nmbclusters / 4;
ip_nmbclusters = nmbclusters;
}
static inline int ipq_lock_try(void);
static inline void ipq_unlock(void);
static inline int
ipq_lock_try(void)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (ipq_locked) {
splx(s);
return (0);
}
ipq_locked = 1;
splx(s);
return (1);
}
static inline void
ipq_unlock(void)
{
int s;
s = splvm();
ipq_locked = 0;
splx(s);
}
#ifdef DIAGNOSTIC
#define IPQ_LOCK() \
do { \
if (ipq_lock_try() == 0) { \
printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
panic("ipq_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define IPQ_LOCK_CHECK() \
do { \
if (ipq_locked == 0) { \
printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
panic("ipq lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define IPQ_LOCK() (void) ipq_lock_try()
#define IPQ_LOCK_CHECK() /* nothing */
#endif
#define IPQ_UNLOCK() ipq_unlock()
/*
* ip_reass_lookup:
*
* Look for queue of fragments of this datagram.
*/
struct ipq *
ip_reass_lookup(struct ip *ip, u_int *hashp)
{
struct ipq *fp;
u_int hash;
IPQ_LOCK();
hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
LIST_FOREACH(fp, &ipq[hash], ipq_q) {
if (ip->ip_id != fp->ipq_id)
continue;
if (!in_hosteq(ip->ip_src, fp->ipq_src))
continue;
if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
continue;
if (ip->ip_p != fp->ipq_p)
continue;
break;
}
*hashp = hash;
return fp;
}
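/*
 * ip_reass_unlock:
 *
 * Release the reassembly lock taken by ip_reass_lookup().
 */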
void
ip_reass_unlock(void)
{
IPQ_UNLOCK();
}
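/*
 * ip_reass_getent:
 *
 * Allocate an ipqent entry for an incoming fragment.  May return NULL
 * if nothing can be allocated without sleeping; the caller then drops
 * the fragment.
 */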
struct ipqent *
ip_reass_getent(void)
{
struct ipqent *ipqe;
int s;
IP_STATINC(IP_STAT_FRAGMENTS);
s = splvm();
ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
splx(s);
return ipqe;
}
/*
* ip_reass:
*
* Take incoming datagram fragment and try to reassemble it into whole
* datagram. If a chain for reassembly of this datagram already exists,
* then it is given as 'fp'; otherwise have to make a chain.
*/
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash)
{
struct ipqhead *ipqhead = &ipq[hash];
const int hlen = ipqe->ipqe_ip->ip_hl << 2;
struct mbuf *m = ipqe->ipqe_m, *t;
struct ipqent *nq, *p, *q;
struct ip *ip;
int i, next, s;
IPQ_LOCK_CHECK();
/*
* Presence of header sizes in mbufs would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
#ifdef notyet
/* Make sure fragment limit is up-to-date. */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags >= ip_maxfrags) {
ip_reass_drophalf();
}
#endif
/*
* We are about to add a fragment; increment frag count.
*/
ip_nfrags++;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
/*
* Enforce upper bound on number of fragmented packets
* for which we attempt reassembly: a) if maxfrag is 0,
* never accept fragments b) if maxfrag is -1, accept
* all fragments without limitation.
*/
if (ip_maxfragpackets < 0)
;
else if (ip_nfragpackets >= ip_maxfragpackets) {
goto dropfrag;
}
ip_nfragpackets++;
fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT);
if (fp == NULL) {
goto dropfrag;
}
LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
fp->ipq_nfrags = 1;
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ipqe->ipqe_ip->ip_p;
fp->ipq_id = ipqe->ipqe_ip->ip_id;
fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
TAILQ_INIT(&fp->ipq_fragq);
fp->ipq_src = ipqe->ipqe_ip->ip_src;
fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
p = NULL;
goto insert;
} else {
fp->ipq_nfrags++;
}
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q))
if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
break;
/*
* If there is a preceding segment, it may provide some of our
* data already. If so, drop the data from the incoming segment.
* If it provides all of our data, drop us.
*/
if (p != NULL) {
i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
ntohs(ipqe->ipqe_ip->ip_off);
if (i > 0) {
if (i >= ntohs(ipqe->ipqe_ip->ip_len)) {
goto dropfrag;
}
m_adj(ipqe->ipqe_m, i);
ipqe->ipqe_ip->ip_off =
htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
ipqe->ipqe_ip->ip_len =
htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
}
}
/*
* While we overlap succeeding segments trim them or, if they are
* completely covered, dequeue them.
*/
for (; q != NULL &&
ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
ntohs(q->ipqe_ip->ip_off); q = nq) {
i = (ntohs(ipqe->ipqe_ip->ip_off) +
ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
if (i < ntohs(q->ipqe_ip->ip_len)) {
q->ipqe_ip->ip_len =
htons(ntohs(q->ipqe_ip->ip_len) - i);
q->ipqe_ip->ip_off =
htons(ntohs(q->ipqe_ip->ip_off) + i);
m_adj(q->ipqe_m, i);
break;
}
nq = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
fp->ipq_nfrags--;
ip_nfrags--;
}
insert:
/*
* Stick new segment in its place; check for complete reassembly.
*/
if (p == NULL) {
TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
} else {
TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
}
next = 0;
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q)) {
if (ntohs(q->ipqe_ip->ip_off) != next) {
IPQ_UNLOCK();
return NULL;
}
next += ntohs(q->ipqe_ip->ip_len);
}
if (p->ipqe_mff) {
IPQ_UNLOCK();
return NULL;
}
/*
* Reassembly is complete. Check for a bogus message size and
* concatenate fragments.
*/
q = TAILQ_FIRST(&fp->ipq_fragq);
ip = q->ipqe_ip;
if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
IP_STATINC(IP_STAT_TOOLONG);
ip_freef(fp);
IPQ_UNLOCK();
return NULL;
}
m = q->ipqe_m;
t = m->m_next;
m->m_next = NULL;
m_cat(m, t);
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
for (q = nq; q != NULL; q = nq) {
t = q->ipqe_m;
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
m_cat(m, t);
}
ip_nfrags -= fp->ipq_nfrags;
/*
* Create header for new packet by modifying header of first
* packet. Dequeue and discard fragment reassembly header. Make
* header visible.
*/
ip->ip_len = htons(next);
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
/* some debugging cruft by sklower, below, will go away soon */
if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
int plen = 0;
for (t = m; t; t = t->m_next) {
plen += t->m_len;
}
m->m_pkthdr.len = plen;
m->m_pkthdr.csum_flags = 0;
}
IPQ_UNLOCK();
return m;
dropfrag:
if (fp != NULL) {
fp->ipq_nfrags--;
}
ip_nfrags--;
IP_STATINC(IP_STAT_FRAGDROPPED);
m_freem(m);
s = splvm();
pool_put(&ipqent_pool, ipqe);
splx(s);
IPQ_UNLOCK();
return NULL;
}
/*
* ip_freef:
*
* Free a fragment reassembly header and all associated datagrams.
*/
void
ip_freef(struct ipq *fp)
{
struct ipqent *q, *p;
u_int nfrags = 0;
int s;
IPQ_LOCK_CHECK();
for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
p = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
nfrags++;
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
}
if (nfrags != fp->ipq_nfrags) {
printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
}
ip_nfrags -= nfrags;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
}
/*
* ip_reass_ttl_decr:
*
* Decrement TTL of all reassembly queue entries by `ticks'. Count
* number of distinct fragments (as opposed to partial, fragmented
* datagrams) in the reassembly queue. While we traverse the entire
* reassembly queue, compute and return the median TTL over all
* fragments.
*/
static u_int
ip_reass_ttl_decr(u_int ticks)
{
u_int nfrags, median, dropfraction, keepfraction;
struct ipq *fp, *nfp;
int i;
nfrags = 0;
memset(fragttl_histo, 0, sizeof(fragttl_histo));
for (i = 0; i < IPREASS_HASH_SIZE; i++) {
for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
0 : fp->ipq_ttl - ticks);
nfp = LIST_NEXT(fp, ipq_q);
if (fp->ipq_ttl == 0) {
IP_STATINC(IP_STAT_FRAGTIMEOUT);
ip_freef(fp);
} else {
nfrags += fp->ipq_nfrags;
fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
}
}
}
KASSERT(ip_nfrags == nfrags);
/* Find median (or other drop fraction) in histogram. */
dropfraction = (ip_nfrags / 2);
keepfraction = ip_nfrags - dropfraction;
for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
median += fragttl_histo[i];
if (median >= keepfraction)
break;
}
/* Return TTL of median (or other fraction). */
return (u_int)i;
}
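/*
 * ip_reass_drophalf:
 *
 * Drop roughly half of all fragments awaiting reassembly.  The first
 * ip_reass_ttl_decr(0) call only computes the median TTL over the
 * queued fragments; the second call then ages every queue by that many
 * ticks, so the fragments at or below the median TTL expire and are
 * freed.
 */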
static void
ip_reass_drophalf(void)
{
u_int median_ticks;
/*
* Compute median TTL of all fragments, and count frags
* with that TTL or lower (roughly half of all fragments).
*/
median_ticks = ip_reass_ttl_decr(0);
/* Drop half. */
median_ticks = ip_reass_ttl_decr(median_ticks);
}
/*
* ip_reass_drain: drain off all datagram fragments. Do not acquire
* softnet_lock, as this can be called from hardware interrupt context.
*/
void
ip_reass_drain(void)
{
/*
* We may be called from a device's interrupt context. If
* the ipq is already busy, just bail out now.
*/
if (ipq_lock_try() != 0) {
/*
* Drop half the total fragments now. If more mbufs are
* needed, we will be called again soon.
*/
ip_reass_drophalf();
IPQ_UNLOCK();
}
}
/*
* ip_reass_slowtimo:
*
* If a timer expires on a reassembly queue, discard it.
*/
void
ip_reass_slowtimo(void)
{
static u_int dropscanidx = 0;
u_int i, median_ttl;
IPQ_LOCK();
/* Age TTL of all fragments by 1 tick. */
median_ttl = ip_reass_ttl_decr(1);
/* Make sure fragment limit is up-to-date. */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags > ip_maxfrags) {
ip_reass_ttl_decr(median_ttl);
}
/*
* If we are over the maximum number of fragmented packets (due to
* the limit being lowered), drain off enough to get down to the
* new limit. Start draining from the reassembly hashqueue most
* recently drained.
*/
if (ip_maxfragpackets < 0)
;
else {
int wrapped = 0;
i = dropscanidx;
while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
while (LIST_FIRST(&ipq[i]) != NULL) {
ip_freef(LIST_FIRST(&ipq[i]));
}
if (++i >= IPREASS_HASH_SIZE) {
i = 0;
}
/*
* Do not scan forever even if fragment counters are
* wrong: stop after scanning entire reassembly queue.
*/
if (i == dropscanidx) {
wrapped = 1;
}
}
dropscanidx = i;
}
IPQ_UNLOCK();
}


@ -1,4 +1,4 @@
/* $NetBSD: ip_var.h,v 1.91 2009/02/01 17:04:11 pooka Exp $ */
/* $NetBSD: ip_var.h,v 1.92 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
@ -198,7 +198,6 @@ extern int ip_maxflows;
extern int ip_hashsize;
#endif
extern struct pool inmulti_pool;
extern struct pool ipqent_pool;
struct inpcb;
struct sockopt;
@ -206,7 +205,6 @@ int ip_ctloutput(int, struct socket *, struct sockopt *);
int ip_dooptions(struct mbuf *);
void ip_drain(void);
void ip_forward(struct mbuf *, int);
void ip_freef(struct ipq *);
void ip_freemoptions(struct ip_moptions *);
int ip_getmoptions(struct ip_moptions *, struct sockopt *);
void ip_init(void);
@ -215,8 +213,18 @@ u_int ip_optlen(struct inpcb *);
int ip_output(struct mbuf *, ...);
int ip_fragment(struct mbuf *, struct ifnet *, u_long);
int ip_pcbopts(struct mbuf **, const struct sockopt *);
struct ipq *
ip_reass_lookup(struct ip *, u_int *);
void ip_reass_unlock(void);
struct ipqent *
ip_reass_getent(void);
struct mbuf *
ip_reass(struct ipqent *, struct ipq *, struct ipqhead *);
ip_reass(struct ipqent *, struct ipq *, u_int);
void ip_reass_slowtimo(void);
void ip_reass_drain(void);
void ip_freef(struct ipq *);
struct in_ifaddr *
ip_rtaddr(struct in_addr);
void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,


@ -1,12 +1,12 @@
# $NetBSD: Makefile.inc,v 1.5 2010/02/16 20:42:47 pooka Exp $
# $NetBSD: Makefile.inc,v 1.6 2010/07/13 22:16:10 rmind Exp $
#
.PATH: ${.CURDIR}/../../../../netinet ${.CURDIR}/../../../../netinet6
# INET
SRCS+= in_proto.c igmp.c in.c in_offload.c in_pcb.c ip_icmp.c \
ip_flow.c ip_id.c ip_input.c ip_output.c raw_ip.c in_cksum.c \
cpu_in_cksum.c in4_cksum.c ip_encap.c
ip_flow.c ip_id.c ip_input.c ip_reass.c ip_output.c raw_ip.c \
in_cksum.c cpu_in_cksum.c in4_cksum.c ip_encap.c
# INET6
SRCS+= dest6.c frag6.c icmp6.c in6.c in6_cksum.c in6_ifattach.c \