Second part of hashed IP reassembly changes:

When under pressure for mbufs, or when there are too many fragments in
the IP reassembly queue, drop half of all fragments. This
multiplicative-drop strategy ensures we return to a healthy state, even
under borderline denial of service from extremely lossy NFS-over-UDP
peers. The multiplicative-drop phase currently drops 50% of fragments,
but support is pre-placed for drop fractions other than 50%.
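
The selection of which half to drop works by histogramming fragments by
their remaining reassembly TTL and scanning from the freshest TTL
downward until roughly the kept fraction has been counted; everything at
or below the returned TTL is then expired. Below is a minimal userland
sketch of that selection only (assumed names ttl_cutoff and FRAG_TTL, a
made-up fragment distribution; the kernel's ip_reass_ttl_decr() in the
diff additionally ages and frees entries as it scans):

	#include <stdio.h>

	#define FRAG_TTL 60	/* assumed reassembly lifetime, in slowtimo ticks */

	static unsigned int fragttl_histo[FRAG_TTL + 1]; /* frags per remaining TTL */

	/*
	 * Scan from the freshest TTL downward until roughly half of the
	 * nfrags fragments have been counted; fragments at or below the
	 * returned TTL are the (older) half that gets dropped.
	 */
	static int
	ttl_cutoff(unsigned int nfrags)
	{
		unsigned int keep = nfrags - nfrags / 2;	/* keep ~50% */
		unsigned int seen = 0;
		int i;

		for (i = FRAG_TTL; i >= 0; i--) {
			seen += fragttl_histo[i];
			if (seen >= keep)
				break;
		}
		return i;
	}

	int
	main(void)
	{
		int i;

		/* Two fragments at every TTL from 1 to 50: 100 fragments total. */
		for (i = 1; i <= 50; i++)
			fragttl_histo[i] = 2;

		/* Prints 26: TTLs 1..26 (52 fragments, ~half) would be dropped. */
		printf("cutoff TTL = %d\n", ttl_cutoff(100));
		return 0;
	}

Dropping a fraction other than 50% amounts to changing only the "keep"
computation, which is why the committed code keeps dropfraction and
keepfraction in separate variables.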

The threshold for the `drop-half' phase is the new variable
ip_maxfrags, which is calculated as nmbclusters/4.

ip_input.c now keeps ip_nmbclusters, a cached copy of nmbclusters.
Before using limits derived from nmbclusters, we check whether
nmbclusters and ip_nmbclusters are equal; if not, we recompute the IP
parameters derived from nmbclusters (based on a suggestion by Jason
Thorpe). ip_maxfrags is currently auto-recalculated.
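
In miniature, the check-and-recompute pattern looks like the sketch
below (hypothetical names; the committed code uses ip_nmbclusters,
ip_nmbclusters_changed() and CHECK_NMBCLUSTER_PARAMS(), visible in the
diff). The fast path costs a single integer compare, and the derived
limit is recomputed only when the tunable is observed to have changed:

	#include <stdio.h>

	static int nmbclusters = 1024;	/* stand-in for the kernel tunable */
	static int cached_nmbclusters;	/* value the limits were derived from */
	static int max_frags;		/* derived limit: nmbclusters / 4 */

	static void
	limits_recalc(void)
	{
		max_frags = nmbclusters / 4;
		cached_nmbclusters = nmbclusters;
	}

	/* One integer compare in the common case; recompute only when stale. */
	#define CHECK_LIMITS()						\
	do {								\
		if (cached_nmbclusters != nmbclusters)			\
			limits_recalc();				\
	} while (0)

	int
	main(void)
	{
		CHECK_LIMITS();
		printf("max_frags = %d\n", max_frags);	/* 256 */

		nmbclusters = 4096;	/* tunable raised at runtime */
		CHECK_LIMITS();
		printf("max_frags = %d\n", max_frags);	/* 1024 */
		return 0;
	}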

The counters ip_nfrags and ip_nfragpackets are now declared static
and uninitialized (bss), to discourage tampering with them.
jonathan 2003-12-14 00:09:24 +00:00
parent 084803988a
commit 9c1a5c5570

@@ -1,4 +1,4 @@
-/* $NetBSD: ip_input.c,v 1.193 2003/12/12 21:17:59 scw Exp $ */
+/* $NetBSD: ip_input.c,v 1.194 2003/12/14 00:09:24 jonathan Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@@ -98,7 +98,7 @@
*/
#include <sys/cdefs.h>
-__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.193 2003/12/12 21:17:59 scw Exp $");
+__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.194 2003/12/14 00:09:24 jonathan Exp $");
#include "opt_inet.h"
#include "opt_gateway.h"
@@ -236,6 +236,19 @@ uint16_t ip_id;
struct pfil_head inet_pfil_hook;
#endif
+/*
+ * Cached copy of nmbclusters. If nmbclusters is different,
+ * recalculate IP parameters derived from nmbclusters.
+ */
+static int	ip_nmbclusters;	/* copy of nmbclusters */
+static void	ip_nmbclusters_changed __P((void));	/* recalc limits */
+#define CHECK_NMBCLUSTER_PARAMS()				\
+do {								\
+	if (__predict_false(ip_nmbclusters != nmbclusters))	\
+		ip_nmbclusters_changed();			\
+} while (0)
/* IP datagram reassembly queues (hashed) */
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
@@ -244,9 +257,27 @@ struct pfil_head inet_pfil_hook;
(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
struct ipqhead ipq[IPREASS_NHASH];
int ipq_locked;
-int	ip_nfragpackets = 0;
-int	ip_maxfragpackets = 200;
-int	ip_nfrags = 0;			/* total fragments in reass queues */
+static int	ip_nfragpackets;	/* packets in reass queue */
+static int	ip_nfrags;		/* total fragments in reass queues */
+int	ip_maxfragpackets = 200;	/* limit on packets. XXX sysctl */
+int	ip_maxfrags;			/* limit on fragments. XXX sysctl */
+/*
+ * Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
+ * IP reassembly queue buffer management.
+ *
+ * We keep a count of total IP fragments (NB: not fragmented packets!)
+ * awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
+ * If ip_nfrags exceeds the limit ip_maxfrags, we drop half the
+ * total fragments in reassembly queues. This AIMD policy avoids
+ * repeatedly deleting single packets under heavy fragmentation load
+ * (e.g., from lossy NFS peers).
+ */
+static u_int	ip_reass_ttl_decr __P((u_int ticks));
+static void	ip_reass_drophalf __P((void));
static __inline int ipq_lock_try __P((void));
static __inline void ipq_unlock __P((void));
@@ -345,6 +376,16 @@ struct mowner ip_rx_mowner = { "internet", "rx" };
struct mowner ip_tx_mowner = { "internet", "tx" };
#endif
+/*
+ * Compute IP limits derived from the value of nmbclusters.
+ */
+static void
+ip_nmbclusters_changed(void)
+{
+	ip_maxfrags = nmbclusters / 4;
+	ip_nmbclusters = nmbclusters;
+}
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
@@ -375,7 +416,10 @@ ip_init()
LIST_INIT(&ipq[i]);
ip_id = time.tv_sec & 0xfffff;
ipintrq.ifq_maxlen = ipqmaxlen;
+	ip_nmbclusters_changed();
TAILQ_INIT(&in_ifaddrhead);
in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, M_IFADDR,
M_WAITOK, &in_ifaddrhash);
@@ -1023,6 +1067,15 @@ ip_reass(ipqe, fp, ipqhead)
m->m_data += hlen;
m->m_len -= hlen;
+#ifdef notyet
+	/* make sure fragment limit is up-to-date */
+	CHECK_NMBCLUSTER_PARAMS();
+	/* If we have too many fragments, drop the older half. */
+	if (ip_nfrags >= ip_maxfrags)
+		ip_reass_drophalf();
+#endif
/*
* We are about to add a fragment; increment frag count.
*/
@@ -1220,6 +1273,74 @@ ip_freef(fp)
ip_nfragpackets--;
}
+/*
+ * IP reassembly TTL machinery for multiplicative drop.
+ */
+static u_int	fragttl_histo[(IPFRAGTTL + 1)];
+
+/*
+ * Decrement TTL of all reassembly queue entries by `ticks'.
+ * Count number of distinct fragments (as opposed to partial, fragmented
+ * datagrams) in the reassembly queue. While we traverse the entire
+ * reassembly queue, compute and return the median TTL over all fragments.
+ */
+static u_int
+ip_reass_ttl_decr(u_int ticks)
+{
+	u_int nfrags, median;
+	struct ipq *fp, *nfp;
+	u_int dropfraction, keepfraction;
+	int i;	/* signed: the median scan below must be able to reach 0 */
+
+	nfrags = 0;
+	memset(fragttl_histo, 0, sizeof fragttl_histo);
+
+	for (i = 0; i < IPREASS_NHASH; i++) {
+		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
+			fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
+			    0 : fp->ipq_ttl - ticks);
+			nfp = LIST_NEXT(fp, ipq_q);
+			if (fp->ipq_ttl == 0) {
+				ipstat.ips_fragtimeout++;
+				ip_freef(fp);
+			} else {
+				nfrags += fp->ipq_nfrags;
+				fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
+			}
+		}
+	}
+
+	KASSERT(ip_nfrags == nfrags);
+
+	/* Find median (or other drop fraction) in histogram. */
+	dropfraction = (ip_nfrags / 2);
+	keepfraction = ip_nfrags - dropfraction;
+	for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
+		median += fragttl_histo[i];
+		if (median >= keepfraction)
+			break;
+	}
+
+	/* Return TTL of median (or other fraction). */
+	return (u_int)i;
+}
+
+void
+ip_reass_drophalf(void)
+{
+	u_int median_ticks;
+
+	/*
+	 * Compute median TTL of all fragments, and count frags
+	 * with that TTL or lower (roughly half of all fragments).
+	 */
+	median_ticks = ip_reass_ttl_decr(0);
+
+	/* Drop half. */
+	median_ticks = ip_reass_ttl_decr(median_ticks);
+}
/*
* IP timer processing;
* if a timer expires on a reassembly
@@ -1230,21 +1351,23 @@ ip_slowtimo()
{
static u_int dropscanidx = 0;
u_int i;
-	struct ipq *fp, *nfp;
+	u_int median_ttl;
int s = splsoftnet();
IPQ_LOCK();
-	for (i = 0; i < IPREASS_NHASH; i++) {
-		for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
-			nfp = LIST_NEXT(fp, ipq_q);
-			if (--fp->ipq_ttl == 0) {
-				ipstat.ips_fragtimeout++;
-				ip_freef(fp);
-			}
-		}
-	}
+	/* Age TTL of all fragments by 1 tick. */
+	median_ttl = ip_reass_ttl_decr(1);
+
+	/* make sure fragment limit is up-to-date */
+	CHECK_NMBCLUSTER_PARAMS();
+
+	/* If we have too many fragments, drop the older half. */
+	if (ip_nfrags > ip_maxfrags)
+		ip_reass_ttl_decr(median_ttl);
/*
-	 * If we are over the maximum number of fragments
+	 * If we are over the maximum number of fragmented packets
* (due to the limit being lowered), drain off
* enough to get down to the new limit. Start draining
* from the reassembly hashqueue most recently drained.
@@ -1283,7 +1406,6 @@
void
ip_drain()
{
-	int i;
/*
* We may be called from a device's interrupt context. If
@@ -1292,15 +1414,11 @@ ip_drain()
if (ipq_lock_try() == 0)
return;
-	for (i = 0; i < IPREASS_NHASH; i++) {
-		struct ipqhead *ipqh = &ipq[i];
-		struct ipq *fp, *nfp;
-		for (fp = LIST_FIRST(ipqh); fp != NULL; fp = nfp) {
-			nfp = LIST_NEXT(fp, ipq_q);
-			ip_freef(fp);
-			ipstat.ips_fragdropped++;
-		}
-	}
+	/*
+	 * Drop half the total fragments now. If more mbufs are needed,
+	 * we will be called again soon.
+	 */
+	ip_reass_drophalf();
IPQ_UNLOCK();
}
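
Why a single drop-half per ip_drain() call is enough: starting from any
backlog of N fragments, each call leaves at most N/2, so even a severe
backlog is gone after about log2(N) calls. A throwaway sketch of that
decay, with a hypothetical backlog size:

	#include <stdio.h>

	int
	main(void)
	{
		unsigned int nfrags = 100000;	/* hypothetical severe backlog */
		int calls = 0;

		while (nfrags > 0) {
			nfrags /= 2;	/* each ip_drain()-style call drops half */
			calls++;
		}
		printf("backlog gone after %d calls\n", calls);	/* 17 ~ log2(1e5) */
		return 0;
	}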