Split off the IPv4 reassembly mechanism into a separate module. Abstract
it into ip_reass_init(), ip_reass_lookup(), etc. (note: the abstraction is
not yet complete).  No functional changes to the actual mechanism.

OK matt@
rmind 2010-07-13 22:16:10 +00:00
parent 29dd668442
commit bcc65ff09f
6 changed files with 726 additions and 550 deletions
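
A rough sketch of the new interface as it is now used from ip_input(), condensed from the ip_input.c hunks below (TOS check, statistics and some error handling omitted; 'm', 'ip', 'mff' and 'hash' are set up exactly as in the diff):

	if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
		struct ipq *fp;
		u_int hash;
		bool mff;

		/* ... length/offset checks, mff and byte-offset conversion ... */

		/* Locks the reassembly queues; returns an existing chain or NULL. */
		fp = ip_reass_lookup(ip, &hash);

		if (mff || ip->ip_off != htons(0)) {
			/* Allocate a fragment entry; may fail under memory pressure. */
			struct ipqent *ipqe = ip_reass_getent();
			if (ipqe == NULL) {
				ip_reass_unlock();
				goto bad;
			}
			ipqe->ipqe_mff = mff;
			ipqe->ipqe_m = m;
			ipqe->ipqe_ip = ip;
			/* Reassemble; drops the lock, returns NULL while incomplete. */
			m = ip_reass(ipqe, fp, hash);
			if (m == NULL)
				return;
		} else if (fp) {
			/* Not reassembling: free any stale queue for this datagram. */
			ip_freef(fp);
			ip_reass_unlock();
		}
	}

The locking discipline is unchanged: ip_reass_lookup() takes the ipq lock, and either ip_reass() or ip_reass_unlock() releases it.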


@ -1,4 +1,4 @@
# $NetBSD: files.netinet,v 1.20 2008/01/25 21:12:14 joerg Exp $
# $NetBSD: files.netinet,v 1.21 2010/07/13 22:16:10 rmind Exp $
defflag opt_tcp_debug.h TCP_DEBUG
defparam opt_tcp_debug.h TCP_NDEBUG
@ -29,6 +29,7 @@ file netinet/ip_id.c inet
file netinet/ip_input.c inet
file netinet/ip_mroute.c inet & mrouting
file netinet/ip_output.c inet
file netinet/ip_reass.c inet
file netinet/raw_ip.c inet
file netinet/tcp_debug.c (inet | inet6) & tcp_debug


@ -1,4 +1,4 @@
/* $NetBSD: in_var.h,v 1.62 2008/04/28 20:24:09 martin Exp $ */
/* $NetBSD: in_var.h,v 1.63 2010/07/13 22:16:10 rmind Exp $ */
/*-
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@ -300,6 +300,7 @@ int in_control(struct socket *, u_long, void *, struct ifnet *,
struct lwp *);
void in_purgeaddr(struct ifaddr *);
void in_purgeif(struct ifnet *);
void ip_reass_init(void);
void ip_input(struct mbuf *);
int ipflow_fastforward(struct mbuf *);
void ip_initid(void);


@ -1,4 +1,4 @@
/* $NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $ */
/* $NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@ -91,7 +91,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $");
__KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.288 2010/07/13 22:16:10 rmind Exp $");
#include "opt_inet.h"
#include "opt_compat_netbsd.h"
@ -104,7 +104,6 @@ __KERNEL_RCSID(0, "$NetBSD: ip_input.c,v 1.287 2010/07/09 18:42:46 rmind Exp $")
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
@ -240,105 +239,7 @@ percpu_t *ipstat_percpu;
struct pfil_head inet_pfil_hook;
#endif
/*
* Cached copy of nmbclusters. If nmbclusters is different,
* recalculate IP parameters derived from nmbclusters.
*/
static int ip_nmbclusters; /* copy of nmbclusters */
static void ip_nmbclusters_changed(void); /* recalc limits */
#define CHECK_NMBCLUSTER_PARAMS() \
do { \
if (__predict_false(ip_nmbclusters != nmbclusters)) \
ip_nmbclusters_changed(); \
} while (/*CONSTCOND*/0)
/* IP datagram reassembly queues (hashed) */
#define IPREASS_NHASH_LOG2 6
#define IPREASS_NHASH (1 << IPREASS_NHASH_LOG2)
#define IPREASS_HMASK (IPREASS_NHASH - 1)
#define IPREASS_HASH(x,y) \
(((((x) & 0xF) | ((((x) >> 8) & 0xF) << 4)) ^ (y)) & IPREASS_HMASK)
struct ipqhead ipq[IPREASS_NHASH];
int ipq_locked;
static int ip_nfragpackets; /* packets in reass queue */
static int ip_nfrags; /* total fragments in reass queues */
int ip_maxfragpackets = 200; /* limit on packets. XXX sysctl */
int ip_maxfrags; /* limit on fragments. XXX sysctl */
/*
* Additive-Increase/Multiplicative-Decrease (AIMD) strategy for
* IP reassembly queue buffer management.
*
* We keep a count of total IP fragments (NB: not fragmented packets!)
* awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
* If ip_nfrags exceeds the ip_maxfrags limit, we drop half the
* total fragments in reassembly queues. This AIMD policy avoids
* repeatedly deleting single packets under heavy fragmentation load
* (e.g., from lossy NFS peers).
*/
static u_int ip_reass_ttl_decr(u_int ticks);
static void ip_reass_drophalf(void);
static inline int ipq_lock_try(void);
static inline void ipq_unlock(void);
static inline int
ipq_lock_try(void)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (ipq_locked) {
splx(s);
return (0);
}
ipq_locked = 1;
splx(s);
return (1);
}
static inline void
ipq_unlock(void)
{
int s;
s = splvm();
ipq_locked = 0;
splx(s);
}
#ifdef DIAGNOSTIC
#define IPQ_LOCK() \
do { \
if (ipq_lock_try() == 0) { \
printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
panic("ipq_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define IPQ_LOCK_CHECK() \
do { \
if (ipq_locked == 0) { \
printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
panic("ipq lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define IPQ_LOCK() (void) ipq_lock_try()
#define IPQ_LOCK_CHECK() /* nothing */
#endif
#define IPQ_UNLOCK() ipq_unlock()
struct pool inmulti_pool;
struct pool ipqent_pool;
#ifdef INET_CSUM_COUNTERS
#include <sys/device.h>
@ -386,16 +287,6 @@ struct mowner ip_tx_mowner = MOWNER_INIT("internet", "tx");
static void sysctl_net_inet_ip_setup(struct sysctllog **);
/*
* Compute IP limits derived from the value of nmbclusters.
*/
static void
ip_nmbclusters_changed(void)
{
ip_maxfrags = nmbclusters / 4;
ip_nmbclusters = nmbclusters;
}
/*
* IP initialization: fill in IP protocol switch table.
* All protocols not implemented in kernel go to raw IP protocol handler.
@ -410,8 +301,6 @@ ip_init(void)
pool_init(&inmulti_pool, sizeof(struct in_multi), 0, 0, 0, "inmltpl",
NULL, IPL_SOFTNET);
pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
NULL, IPL_VM);
pr = pffindproto(PF_INET, IPPROTO_RAW, SOCK_RAW);
if (pr == 0)
@ -424,14 +313,12 @@ ip_init(void)
pr->pr_protocol && pr->pr_protocol != IPPROTO_RAW)
ip_protox[pr->pr_protocol] = pr - inetsw;
for (i = 0; i < IPREASS_NHASH; i++)
LIST_INIT(&ipq[i]);
ip_reass_init();
ip_initid();
ip_id = time_second & 0xfffff;
ipintrq.ifq_maxlen = ipqmaxlen;
ip_nmbclusters_changed();
TAILQ_INIT(&in_ifaddrhead);
in_ifaddrhashtbl = hashinit(IN_IFADDR_HASH_SIZE, HASH_LIST, true,
@ -515,16 +402,12 @@ void
ip_input(struct mbuf *m)
{
struct ip *ip = NULL;
struct ipq *fp;
struct in_ifaddr *ia;
struct ifaddr *ifa;
struct ipqent *ipqe;
int hlen = 0, mff, len;
int hlen = 0, len;
int downmatch;
int checkif;
int srcrt = 0;
int s;
u_int hash;
#ifdef FAST_IPSEC
struct m_tag *mtag;
struct tdb_ident *tdbi;
@ -924,13 +807,12 @@ ip_input(struct mbuf *m)
ours:
/*
* If offset or IP_MF are set, must reassemble.
* Otherwise, nothing need be done.
* (We could look in the reassembly queue to see
* if the packet was previously fragmented,
* but it's not worth the time; just let them time out.)
*/
if (ip->ip_off & ~htons(IP_DF|IP_RF)) {
u_int off;
struct ipq *fp;
u_int off, hash;
bool mff;
/*
* Prevent TCP blind data attacks by not allowing non-initial
* fragments to start at less than 68 bytes (minimal fragment
@ -944,16 +826,16 @@ ours:
}
/*
* Adjust ip_len to not reflect header,
* set ipqe_mff if more fragments are expected,
* convert offset of this to bytes.
* Adjust total IP length to not reflect header. Set 'mff'
* indicator, if more fragments are expected. Convert offset
* of this to bytes.
*/
ip->ip_len = htons(ntohs(ip->ip_len) - hlen);
mff = (ip->ip_off & htons(IP_MF)) != 0;
if (mff) {
/*
* Make sure that fragments have a data length
* that's a non-zero multiple of 8 bytes.
* which is non-zero and a multiple of 8 bytes.
*/
if (ntohs(ip->ip_len) == 0 ||
(ntohs(ip->ip_len) & 0x7) != 0) {
@ -963,29 +845,14 @@ ours:
}
ip->ip_off = htons((ntohs(ip->ip_off) & IP_OFFMASK) << 3);
/*
* Look for queue of fragments of this datagram.
*/
IPQ_LOCK();
hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
LIST_FOREACH(fp, &ipq[hash], ipq_q) {
if (ip->ip_id != fp->ipq_id)
continue;
if (!in_hosteq(ip->ip_src, fp->ipq_src))
continue;
if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
continue;
if (ip->ip_p != fp->ipq_p)
continue;
/*
* Make sure the TOS matches previous fragments.
*/
if (ip->ip_tos != fp->ipq_tos) {
IP_STATINC(IP_STAT_BADFRAGS);
IPQ_UNLOCK();
goto bad;
}
break;
/* Look for queue of fragments of this datagram. */
fp = ip_reass_lookup(ip, &hash);
/* Make sure the TOS matches previous fragments. */
if (fp && fp->ipq_tos != ip->ip_tos) {
IP_STATINC(IP_STAT_BADFRAGS);
ip_reass_unlock();
goto bad;
}
/*
@ -994,21 +861,19 @@ ours:
* attempt reassembly; if it succeeds, proceed.
*/
if (mff || ip->ip_off != htons(0)) {
IP_STATINC(IP_STAT_FRAGMENTS);
s = splvm();
ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
splx(s);
struct ipqent *ipqe;
ipqe = ip_reass_getent();
if (ipqe == NULL) {
IP_STATINC(IP_STAT_RCVMEMDROP);
IPQ_UNLOCK();
ip_reass_unlock();
goto bad;
}
ipqe->ipqe_mff = mff;
ipqe->ipqe_m = m;
ipqe->ipqe_ip = ip;
m = ip_reass(ipqe, fp, &ipq[hash]);
m = ip_reass(ipqe, fp, hash);
if (m == NULL) {
IPQ_UNLOCK();
return;
}
IP_STATINC(IP_STAT_REASSEMBLED);
@ -1017,8 +882,8 @@ ours:
ip->ip_len = htons(ntohs(ip->ip_len) + hlen);
} else if (fp) {
ip_freef(fp);
ip_reass_unlock();
}
IPQ_UNLOCK();
}
#if defined(IPSEC)
@ -1096,398 +961,30 @@ badcsum:
}
/*
* Take incoming datagram fragment and try to
* reassemble it into whole datagram. If a chain for
* reassembly of this datagram already exists, then it
* is given as fp; otherwise have to make a chain.
*/
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp, struct ipqhead *ipqhead)
{
struct mbuf *m = ipqe->ipqe_m;
struct ipqent *nq, *p, *q;
struct ip *ip;
struct mbuf *t;
int hlen = ipqe->ipqe_ip->ip_hl << 2;
int i, next, s;
IPQ_LOCK_CHECK();
/*
* Presence of header sizes in mbufs
* would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
#ifdef notyet
/* make sure fragment limit is up-to-date */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags >= ip_maxfrags)
ip_reass_drophalf();
#endif
/*
* We are about to add a fragment; increment frag count.
*/
ip_nfrags++;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == 0) {
/*
* Enforce upper bound on number of fragmented packets
* for which we attempt reassembly;
* If maxfrag is 0, never accept fragments.
* If maxfrag is -1, accept all fragments without limitation.
*/
if (ip_maxfragpackets < 0)
;
else if (ip_nfragpackets >= ip_maxfragpackets)
goto dropfrag;
ip_nfragpackets++;
fp = malloc(sizeof (struct ipq), M_FTABLE, M_NOWAIT);
if (fp == NULL)
goto dropfrag;
LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
fp->ipq_nfrags = 1;
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ipqe->ipqe_ip->ip_p;
fp->ipq_id = ipqe->ipqe_ip->ip_id;
fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
TAILQ_INIT(&fp->ipq_fragq);
fp->ipq_src = ipqe->ipqe_ip->ip_src;
fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
p = NULL;
goto insert;
} else {
fp->ipq_nfrags++;
}
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q))
if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
break;
/*
* If there is a preceding segment, it may provide some of
* our data already. If so, drop the data from the incoming
* segment. If it provides all of our data, drop us.
*/
if (p != NULL) {
i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
ntohs(ipqe->ipqe_ip->ip_off);
if (i > 0) {
if (i >= ntohs(ipqe->ipqe_ip->ip_len))
goto dropfrag;
m_adj(ipqe->ipqe_m, i);
ipqe->ipqe_ip->ip_off =
htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
ipqe->ipqe_ip->ip_len =
htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
}
}
/*
* While we overlap succeeding segments trim them or,
* if they are completely covered, dequeue them.
*/
for (; q != NULL &&
ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
ntohs(q->ipqe_ip->ip_off); q = nq) {
i = (ntohs(ipqe->ipqe_ip->ip_off) +
ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
if (i < ntohs(q->ipqe_ip->ip_len)) {
q->ipqe_ip->ip_len =
htons(ntohs(q->ipqe_ip->ip_len) - i);
q->ipqe_ip->ip_off =
htons(ntohs(q->ipqe_ip->ip_off) + i);
m_adj(q->ipqe_m, i);
break;
}
nq = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
fp->ipq_nfrags--;
ip_nfrags--;
}
insert:
/*
* Stick new segment in its place;
* check for complete reassembly.
*/
if (p == NULL) {
TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
} else {
TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
}
next = 0;
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q)) {
if (ntohs(q->ipqe_ip->ip_off) != next)
return (0);
next += ntohs(q->ipqe_ip->ip_len);
}
if (p->ipqe_mff)
return (0);
/*
* Reassembly is complete. Check for a bogus message size and
* concatenate fragments.
*/
q = TAILQ_FIRST(&fp->ipq_fragq);
ip = q->ipqe_ip;
if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
IP_STATINC(IP_STAT_TOOLONG);
ip_freef(fp);
return (0);
}
m = q->ipqe_m;
t = m->m_next;
m->m_next = 0;
m_cat(m, t);
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
for (q = nq; q != NULL; q = nq) {
t = q->ipqe_m;
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
m_cat(m, t);
}
ip_nfrags -= fp->ipq_nfrags;
/*
* Create header for new ip packet by
* modifying header of first packet;
* dequeue and discard fragment reassembly header.
* Make header visible.
*/
ip->ip_len = htons(next);
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
/* some debugging cruft by sklower, below, will go away soon */
if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
int plen = 0;
for (t = m; t; t = t->m_next)
plen += t->m_len;
m->m_pkthdr.len = plen;
m->m_pkthdr.csum_flags = 0;
}
return (m);
dropfrag:
if (fp != 0)
fp->ipq_nfrags--;
ip_nfrags--;
IP_STATINC(IP_STAT_FRAGDROPPED);
m_freem(m);
s = splvm();
pool_put(&ipqent_pool, ipqe);
splx(s);
return (0);
}
/*
* Free a fragment reassembly header and all
* associated datagrams.
*/
void
ip_freef(struct ipq *fp)
{
struct ipqent *q, *p;
u_int nfrags = 0;
int s;
IPQ_LOCK_CHECK();
for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
p = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
nfrags++;
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
}
if (nfrags != fp->ipq_nfrags)
printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
ip_nfrags -= nfrags;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
}
/*
* IP reassembly TTL machinery for multiplicative drop.
*/
static u_int fragttl_histo[(IPFRAGTTL+1)];
/*
* Decrement TTL of all reassembly queue entries by `ticks'.
* Count number of distinct fragments (as opposed to partial, fragmented
* datagrams) in the reassembly queue. While we traverse the entire
* reassembly queue, compute and return the median TTL over all fragments.
*/
static u_int
ip_reass_ttl_decr(u_int ticks)
{
u_int nfrags, median, dropfraction, keepfraction;
struct ipq *fp, *nfp;
int i;
nfrags = 0;
memset(fragttl_histo, 0, sizeof fragttl_histo);
for (i = 0; i < IPREASS_NHASH; i++) {
for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
0 : fp->ipq_ttl - ticks);
nfp = LIST_NEXT(fp, ipq_q);
if (fp->ipq_ttl == 0) {
IP_STATINC(IP_STAT_FRAGTIMEOUT);
ip_freef(fp);
} else {
nfrags += fp->ipq_nfrags;
fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
}
}
}
KASSERT(ip_nfrags == nfrags);
/* Find median (or other drop fraction) in histogram. */
dropfraction = (ip_nfrags / 2);
keepfraction = ip_nfrags - dropfraction;
for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
median += fragttl_histo[i];
if (median >= keepfraction)
break;
}
/* Return TTL of median (or other fraction). */
return (u_int)i;
}
void
ip_reass_drophalf(void)
{
u_int median_ticks;
/*
* Compute median TTL of all fragments, and count frags
* with that TTL or lower (roughly half of all fragments).
*/
median_ticks = ip_reass_ttl_decr(0);
/* Drop half. */
median_ticks = ip_reass_ttl_decr(median_ticks);
}
/*
* IP timer processing;
* if a timer expires on a reassembly
* queue, discard it.
* IP timer processing.
*/
void
ip_slowtimo(void)
{
static u_int dropscanidx = 0;
u_int i;
u_int median_ttl;
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
IPQ_LOCK();
/* Age TTL of all fragments by 1 tick. */
median_ttl = ip_reass_ttl_decr(1);
/* make sure fragment limit is up-to-date */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags > ip_maxfrags)
ip_reass_ttl_decr(median_ttl);
/*
* If we are over the maximum number of fragmented packets
* (due to the limit being lowered), drain off
* enough to get down to the new limit. Start draining
* from the reassembly hashqueue most recently drained.
*/
if (ip_maxfragpackets < 0)
;
else {
int wrapped = 0;
i = dropscanidx;
while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
while (LIST_FIRST(&ipq[i]) != NULL)
ip_freef(LIST_FIRST(&ipq[i]));
if (++i >= IPREASS_NHASH) {
i = 0;
}
/*
* Don't scan forever even if fragment counters are
* wrong: stop after scanning entire reassembly queue.
*/
if (i == dropscanidx)
wrapped = 1;
}
dropscanidx = i;
}
IPQ_UNLOCK();
ip_reass_slowtimo();
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
}
/*
* Drain off all datagram fragments. Don't acquire softnet_lock as
* can be called from hardware interrupt context.
* IP drain processing.
*/
void
ip_drain(void)
{
KERNEL_LOCK(1, NULL);
/*
* We may be called from a device's interrupt context. If
* the ipq is already busy, just bail out now.
*/
if (ipq_lock_try() != 0) {
/*
* Drop half the total fragments now. If more mbufs are
* needed, we will be called again soon.
*/
ip_reass_drophalf();
IPQ_UNLOCK();
}
ip_reass_drain();
KERNEL_UNLOCK_ONE(NULL);
}
@ -2430,14 +1927,6 @@ sysctl_net_inet_ip_setup(struct sysctllog **clog)
CTL_NET, PF_INET, IPPROTO_IP,
IPCTL_LOWPORTMAX, CTL_EOL);
#endif /* IPNOPRIVPORTS */
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to retain for "
"possible reassembly"),
NULL, 0, &ip_maxfragpackets, 0,
CTL_NET, PF_INET, IPPROTO_IP,
IPCTL_MAXFRAGPACKETS, CTL_EOL);
#if NGRE > 0
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,

sys/netinet/ip_reass.c (new file, 677 lines)

@ -0,0 +1,677 @@
/* $NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)ip_input.c 8.2 (Berkeley) 1/4/94
*/
/*
* IP reassembly.
*
* Additive-Increase/Multiplicative-Decrease (AIMD) strategy for IP
* reassembly queue buffer management.
*
* We keep a count of total IP fragments (NB: not fragmented packets)
* awaiting reassembly (ip_nfrags) and a limit (ip_maxfrags) on fragments.
* If ip_nfrags exceeds the ip_maxfrags limit, we drop half the total
* fragments in reassembly queues. This AIMD policy avoids repeatedly
* deleting single packets under heavy fragmentation load (e.g., from lossy
* NFS peers).
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_reass.c,v 1.1 2010/07/13 22:16:10 rmind Exp $");
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/pool.h>
#include <sys/sysctl.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/in_proto.h>
#include <netinet/ip_private.h>
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
/*
* IP datagram reassembly hashed queues, pool, lock and counters.
*/
#define IPREASS_HASH_SHIFT 6
#define IPREASS_HASH_SIZE (1 << IPREASS_HASH_SHIFT)
#define IPREASS_HASH_MASK (IPREASS_HASH_SIZE - 1)
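/*
 * The hash takes four bits from each of the two low-order bytes of
 * ip_src.s_addr, XORs in the IP id, and masks the result down to one
 * of the IPREASS_HASH_SIZE buckets.
 */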
#define IPREASS_HASH(x, y) \
(((((x) & 0xf) | ((((x) >> 8) & 0xf) << 4)) ^ (y)) & IPREASS_HASH_MASK)
struct ipqhead ipq[IPREASS_HASH_SIZE];
struct pool ipqent_pool;
static int ipq_locked;
static int ip_nfragpackets; /* packets in reass queue */
static int ip_nfrags; /* total fragments in reass queues */
static int ip_maxfragpackets; /* limit on packets. XXX sysctl */
static int ip_maxfrags; /* limit on fragments. XXX sysctl */
/*
* Cached copy of nmbclusters. If nmbclusters is different,
* recalculate IP parameters derived from nmbclusters.
*/
static int ip_nmbclusters; /* copy of nmbclusters */
/*
* IP reassembly TTL machinery for multiplicative drop.
*/
static u_int fragttl_histo[IPFRAGTTL + 1];
void sysctl_ip_reass_setup(void);
static void ip_nmbclusters_changed(void);
static u_int ip_reass_ttl_decr(u_int ticks);
static void ip_reass_drophalf(void);
/*
* ip_reass_init:
*
* Initialization of IP reassembly mechanism.
*/
void
ip_reass_init(void)
{
int i;
pool_init(&ipqent_pool, sizeof(struct ipqent), 0, 0, 0, "ipqepl",
NULL, IPL_VM);
for (i = 0; i < IPREASS_HASH_SIZE; i++) {
LIST_INIT(&ipq[i]);
}
ip_maxfragpackets = 200;
ip_maxfrags = 0;
ip_nmbclusters_changed();
sysctl_ip_reass_setup();
}
static struct sysctllog *ip_reass_sysctllog;
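/*
 * sysctl_ip_reass_setup:
 *
 * Register the net.inet.ip sysctl subtree for reassembly (currently
 * only the maxfragpackets limit).
 */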
void
sysctl_ip_reass_setup(void)
{
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "net", NULL,
NULL, 0, NULL, 0,
CTL_NET, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet",
SYSCTL_DESCR("PF_INET related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "ip",
SYSCTL_DESCR("IPv4 related settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET, IPPROTO_IP, CTL_EOL);
sysctl_createv(&ip_reass_sysctllog, 0, NULL, NULL,
CTLFLAG_PERMANENT|CTLFLAG_READWRITE,
CTLTYPE_INT, "maxfragpackets",
SYSCTL_DESCR("Maximum number of fragments to retain for "
"possible reassembly"),
NULL, 0, &ip_maxfragpackets, 0,
CTL_NET, PF_INET, IPPROTO_IP, IPCTL_MAXFRAGPACKETS, CTL_EOL);
}
#define CHECK_NMBCLUSTER_PARAMS() \
do { \
if (__predict_false(ip_nmbclusters != nmbclusters)) \
ip_nmbclusters_changed(); \
} while (/*CONSTCOND*/0)
/*
* Compute IP limits derived from the value of nmbclusters.
*/
static void
ip_nmbclusters_changed(void)
{
ip_maxfrags = nmbclusters / 4;
ip_nmbclusters = nmbclusters;
}
static inline int ipq_lock_try(void);
static inline void ipq_unlock(void);
static inline int
ipq_lock_try(void)
{
int s;
/*
* Use splvm() -- we're blocking things that would cause
* mbuf allocation.
*/
s = splvm();
if (ipq_locked) {
splx(s);
return (0);
}
ipq_locked = 1;
splx(s);
return (1);
}
static inline void
ipq_unlock(void)
{
int s;
s = splvm();
ipq_locked = 0;
splx(s);
}
#ifdef DIAGNOSTIC
#define IPQ_LOCK() \
do { \
if (ipq_lock_try() == 0) { \
printf("%s:%d: ipq already locked\n", __FILE__, __LINE__); \
panic("ipq_lock"); \
} \
} while (/*CONSTCOND*/ 0)
#define IPQ_LOCK_CHECK() \
do { \
if (ipq_locked == 0) { \
printf("%s:%d: ipq lock not held\n", __FILE__, __LINE__); \
panic("ipq lock check"); \
} \
} while (/*CONSTCOND*/ 0)
#else
#define IPQ_LOCK() (void) ipq_lock_try()
#define IPQ_LOCK_CHECK() /* nothing */
#endif
#define IPQ_UNLOCK() ipq_unlock()
/*
* ip_reass_lookup:
*
* Look for queue of fragments of this datagram.
*/
struct ipq *
ip_reass_lookup(struct ip *ip, u_int *hashp)
{
struct ipq *fp;
u_int hash;
IPQ_LOCK();
hash = IPREASS_HASH(ip->ip_src.s_addr, ip->ip_id);
LIST_FOREACH(fp, &ipq[hash], ipq_q) {
if (ip->ip_id != fp->ipq_id)
continue;
if (!in_hosteq(ip->ip_src, fp->ipq_src))
continue;
if (!in_hosteq(ip->ip_dst, fp->ipq_dst))
continue;
if (ip->ip_p != fp->ipq_p)
continue;
break;
}
*hashp = hash;
return fp;
}
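/*
 * ip_reass_unlock:
 *
 * Release the reassembly lock taken by ip_reass_lookup().
 */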
void
ip_reass_unlock(void)
{
IPQ_UNLOCK();
}
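/*
 * ip_reass_getent:
 *
 * Allocate an ipqent entry for an incoming fragment.  May return NULL
 * if nothing can be allocated without sleeping; the caller then drops
 * the fragment.
 */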
struct ipqent *
ip_reass_getent(void)
{
struct ipqent *ipqe;
int s;
IP_STATINC(IP_STAT_FRAGMENTS);
s = splvm();
ipqe = pool_get(&ipqent_pool, PR_NOWAIT);
splx(s);
return ipqe;
}
/*
* ip_reass:
*
* Take incoming datagram fragment and try to reassemble it into whole
* datagram. If a chain for reassembly of this datagram already exists,
* then it is given as 'fp'; otherwise have to make a chain.
*/
struct mbuf *
ip_reass(struct ipqent *ipqe, struct ipq *fp, u_int hash)
{
struct ipqhead *ipqhead = &ipq[hash];
const int hlen = ipqe->ipqe_ip->ip_hl << 2;
struct mbuf *m = ipqe->ipqe_m, *t;
struct ipqent *nq, *p, *q;
struct ip *ip;
int i, next, s;
IPQ_LOCK_CHECK();
/*
* Presence of header sizes in mbufs would confuse code below.
*/
m->m_data += hlen;
m->m_len -= hlen;
#ifdef notyet
/* Make sure fragment limit is up-to-date. */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags >= ip_maxfrags) {
ip_reass_drophalf();
}
#endif
/*
* We are about to add a fragment; increment frag count.
*/
ip_nfrags++;
/*
* If first fragment to arrive, create a reassembly queue.
*/
if (fp == NULL) {
/*
* Enforce upper bound on number of fragmented packets
* for which we attempt reassembly: a) if maxfrag is 0,
* never accept fragments b) if maxfrag is -1, accept
* all fragments without limitation.
*/
if (ip_maxfragpackets < 0)
;
else if (ip_nfragpackets >= ip_maxfragpackets) {
goto dropfrag;
}
ip_nfragpackets++;
fp = malloc(sizeof(struct ipq), M_FTABLE, M_NOWAIT);
if (fp == NULL) {
goto dropfrag;
}
LIST_INSERT_HEAD(ipqhead, fp, ipq_q);
fp->ipq_nfrags = 1;
fp->ipq_ttl = IPFRAGTTL;
fp->ipq_p = ipqe->ipqe_ip->ip_p;
fp->ipq_id = ipqe->ipqe_ip->ip_id;
fp->ipq_tos = ipqe->ipqe_ip->ip_tos;
TAILQ_INIT(&fp->ipq_fragq);
fp->ipq_src = ipqe->ipqe_ip->ip_src;
fp->ipq_dst = ipqe->ipqe_ip->ip_dst;
p = NULL;
goto insert;
} else {
fp->ipq_nfrags++;
}
/*
* Find a segment which begins after this one does.
*/
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q))
if (ntohs(q->ipqe_ip->ip_off) > ntohs(ipqe->ipqe_ip->ip_off))
break;
/*
* If there is a preceding segment, it may provide some of our
* data already. If so, drop the data from the incoming segment.
* If it provides all of our data, drop us.
*/
if (p != NULL) {
i = ntohs(p->ipqe_ip->ip_off) + ntohs(p->ipqe_ip->ip_len) -
ntohs(ipqe->ipqe_ip->ip_off);
if (i > 0) {
if (i >= ntohs(ipqe->ipqe_ip->ip_len)) {
goto dropfrag;
}
m_adj(ipqe->ipqe_m, i);
ipqe->ipqe_ip->ip_off =
htons(ntohs(ipqe->ipqe_ip->ip_off) + i);
ipqe->ipqe_ip->ip_len =
htons(ntohs(ipqe->ipqe_ip->ip_len) - i);
}
}
/*
* While we overlap succeeding segments trim them or, if they are
* completely covered, dequeue them.
*/
for (; q != NULL &&
ntohs(ipqe->ipqe_ip->ip_off) + ntohs(ipqe->ipqe_ip->ip_len) >
ntohs(q->ipqe_ip->ip_off); q = nq) {
i = (ntohs(ipqe->ipqe_ip->ip_off) +
ntohs(ipqe->ipqe_ip->ip_len)) - ntohs(q->ipqe_ip->ip_off);
if (i < ntohs(q->ipqe_ip->ip_len)) {
q->ipqe_ip->ip_len =
htons(ntohs(q->ipqe_ip->ip_len) - i);
q->ipqe_ip->ip_off =
htons(ntohs(q->ipqe_ip->ip_off) + i);
m_adj(q->ipqe_m, i);
break;
}
nq = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
fp->ipq_nfrags--;
ip_nfrags--;
}
insert:
/*
* Stick new segment in its place; check for complete reassembly.
*/
if (p == NULL) {
TAILQ_INSERT_HEAD(&fp->ipq_fragq, ipqe, ipqe_q);
} else {
TAILQ_INSERT_AFTER(&fp->ipq_fragq, p, ipqe, ipqe_q);
}
next = 0;
for (p = NULL, q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL;
p = q, q = TAILQ_NEXT(q, ipqe_q)) {
if (ntohs(q->ipqe_ip->ip_off) != next) {
IPQ_UNLOCK();
return NULL;
}
next += ntohs(q->ipqe_ip->ip_len);
}
if (p->ipqe_mff) {
IPQ_UNLOCK();
return NULL;
}
/*
* Reassembly is complete. Check for a bogus message size and
* concatenate fragments.
*/
q = TAILQ_FIRST(&fp->ipq_fragq);
ip = q->ipqe_ip;
if ((next + (ip->ip_hl << 2)) > IP_MAXPACKET) {
IP_STATINC(IP_STAT_TOOLONG);
ip_freef(fp);
IPQ_UNLOCK();
return NULL;
}
m = q->ipqe_m;
t = m->m_next;
m->m_next = NULL;
m_cat(m, t);
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
for (q = nq; q != NULL; q = nq) {
t = q->ipqe_m;
nq = TAILQ_NEXT(q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
m_cat(m, t);
}
ip_nfrags -= fp->ipq_nfrags;
/*
* Create header for new packet by modifying header of first
* packet. Dequeue and discard fragment reassembly header. Make
* header visible.
*/
ip->ip_len = htons(next);
ip->ip_src = fp->ipq_src;
ip->ip_dst = fp->ipq_dst;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
m->m_len += (ip->ip_hl << 2);
m->m_data -= (ip->ip_hl << 2);
/* some debugging cruft by sklower, below, will go away soon */
if (m->m_flags & M_PKTHDR) { /* XXX this should be done elsewhere */
int plen = 0;
for (t = m; t; t = t->m_next) {
plen += t->m_len;
}
m->m_pkthdr.len = plen;
m->m_pkthdr.csum_flags = 0;
}
IPQ_UNLOCK();
return m;
dropfrag:
if (fp != NULL) {
fp->ipq_nfrags--;
}
ip_nfrags--;
IP_STATINC(IP_STAT_FRAGDROPPED);
m_freem(m);
s = splvm();
pool_put(&ipqent_pool, ipqe);
splx(s);
IPQ_UNLOCK();
return NULL;
}
/*
* ip_freef:
*
* Free a fragment reassembly header and all associated datagrams.
*/
void
ip_freef(struct ipq *fp)
{
struct ipqent *q, *p;
u_int nfrags = 0;
int s;
IPQ_LOCK_CHECK();
for (q = TAILQ_FIRST(&fp->ipq_fragq); q != NULL; q = p) {
p = TAILQ_NEXT(q, ipqe_q);
m_freem(q->ipqe_m);
nfrags++;
TAILQ_REMOVE(&fp->ipq_fragq, q, ipqe_q);
s = splvm();
pool_put(&ipqent_pool, q);
splx(s);
}
if (nfrags != fp->ipq_nfrags) {
printf("ip_freef: nfrags %d != %d\n", fp->ipq_nfrags, nfrags);
}
ip_nfrags -= nfrags;
LIST_REMOVE(fp, ipq_q);
free(fp, M_FTABLE);
ip_nfragpackets--;
}
/*
* ip_reass_ttl_decr:
*
* Decrement TTL of all reassembly queue entries by `ticks'. Count
* number of distinct fragments (as opposed to partial, fragmented
* datagrams) in the reassembly queue. While we traverse the entire
* reassembly queue, compute and return the median TTL over all
* fragments.
*/
static u_int
ip_reass_ttl_decr(u_int ticks)
{
u_int nfrags, median, dropfraction, keepfraction;
struct ipq *fp, *nfp;
int i;
nfrags = 0;
memset(fragttl_histo, 0, sizeof(fragttl_histo));
for (i = 0; i < IPREASS_HASH_SIZE; i++) {
for (fp = LIST_FIRST(&ipq[i]); fp != NULL; fp = nfp) {
fp->ipq_ttl = ((fp->ipq_ttl <= ticks) ?
0 : fp->ipq_ttl - ticks);
nfp = LIST_NEXT(fp, ipq_q);
if (fp->ipq_ttl == 0) {
IP_STATINC(IP_STAT_FRAGTIMEOUT);
ip_freef(fp);
} else {
nfrags += fp->ipq_nfrags;
fragttl_histo[fp->ipq_ttl] += fp->ipq_nfrags;
}
}
}
KASSERT(ip_nfrags == nfrags);
/* Find median (or other drop fraction) in histogram. */
dropfraction = (ip_nfrags / 2);
keepfraction = ip_nfrags - dropfraction;
for (i = IPFRAGTTL, median = 0; i >= 0; i--) {
median += fragttl_histo[i];
if (median >= keepfraction)
break;
}
/* Return TTL of median (or other fraction). */
return (u_int)i;
}
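/*
 * ip_reass_drophalf:
 *
 * Drop roughly half of all fragments awaiting reassembly.  The first
 * ip_reass_ttl_decr(0) call only computes the median TTL over the
 * queued fragments; the second call then ages every queue by that many
 * ticks, so the fragments at or below the median TTL expire and are
 * freed.
 */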
static void
ip_reass_drophalf(void)
{
u_int median_ticks;
/*
* Compute median TTL of all fragments, and count frags
* with that TTL or lower (roughly half of all fragments).
*/
median_ticks = ip_reass_ttl_decr(0);
/* Drop half. */
median_ticks = ip_reass_ttl_decr(median_ticks);
}
/*
* ip_reass_drain: drain off all datagram fragments. Do not acquire
* softnet_lock, as this can be called from hardware interrupt context.
*/
void
ip_reass_drain(void)
{
/*
* We may be called from a device's interrupt context. If
* the ipq is already busy, just bail out now.
*/
if (ipq_lock_try() != 0) {
/*
* Drop half the total fragments now. If more mbufs are
* needed, we will be called again soon.
*/
ip_reass_drophalf();
IPQ_UNLOCK();
}
}
/*
* ip_reass_slowtimo:
*
* If a timer expires on a reassembly queue, discard it.
*/
void
ip_reass_slowtimo(void)
{
static u_int dropscanidx = 0;
u_int i, median_ttl;
IPQ_LOCK();
/* Age TTL of all fragments by 1 tick. */
median_ttl = ip_reass_ttl_decr(1);
/* Make sure fragment limit is up-to-date. */
CHECK_NMBCLUSTER_PARAMS();
/* If we have too many fragments, drop the older half. */
if (ip_nfrags > ip_maxfrags) {
ip_reass_ttl_decr(median_ttl);
}
/*
* If we are over the maximum number of fragmented packets (due to
* the limit being lowered), drain off enough to get down to the
* new limit. Start draining from the reassembly hashqueue most
* recently drained.
*/
if (ip_maxfragpackets < 0)
;
else {
int wrapped = 0;
i = dropscanidx;
while (ip_nfragpackets > ip_maxfragpackets && wrapped == 0) {
while (LIST_FIRST(&ipq[i]) != NULL) {
ip_freef(LIST_FIRST(&ipq[i]));
}
if (++i >= IPREASS_HASH_SIZE) {
i = 0;
}
/*
* Do not scan forever even if fragment counters are
* wrong: stop after scanning entire reassembly queue.
*/
if (i == dropscanidx) {
wrapped = 1;
}
}
dropscanidx = i;
}
IPQ_UNLOCK();
}


@ -1,4 +1,4 @@
/* $NetBSD: ip_var.h,v 1.91 2009/02/01 17:04:11 pooka Exp $ */
/* $NetBSD: ip_var.h,v 1.92 2010/07/13 22:16:10 rmind Exp $ */
/*
* Copyright (c) 1982, 1986, 1993
@ -198,7 +198,6 @@ extern int ip_maxflows;
extern int ip_hashsize;
#endif
extern struct pool inmulti_pool;
extern struct pool ipqent_pool;
struct inpcb;
struct sockopt;
@ -206,7 +205,6 @@ int ip_ctloutput(int, struct socket *, struct sockopt *);
int ip_dooptions(struct mbuf *);
void ip_drain(void);
void ip_forward(struct mbuf *, int);
void ip_freef(struct ipq *);
void ip_freemoptions(struct ip_moptions *);
int ip_getmoptions(struct ip_moptions *, struct sockopt *);
void ip_init(void);
@ -215,8 +213,18 @@ u_int ip_optlen(struct inpcb *);
int ip_output(struct mbuf *, ...);
int ip_fragment(struct mbuf *, struct ifnet *, u_long);
int ip_pcbopts(struct mbuf **, const struct sockopt *);
struct ipq *
ip_reass_lookup(struct ip *, u_int *);
void ip_reass_unlock(void);
struct ipqent *
ip_reass_getent(void);
struct mbuf *
ip_reass(struct ipqent *, struct ipq *, struct ipqhead *);
ip_reass(struct ipqent *, struct ipq *, u_int);
void ip_reass_slowtimo(void);
void ip_reass_drain(void);
void ip_freef(struct ipq *);
struct in_ifaddr *
ip_rtaddr(struct in_addr);
void ip_savecontrol(struct inpcb *, struct mbuf **, struct ip *,


@ -1,12 +1,12 @@
# $NetBSD: Makefile.inc,v 1.5 2010/02/16 20:42:47 pooka Exp $
# $NetBSD: Makefile.inc,v 1.6 2010/07/13 22:16:10 rmind Exp $
#
.PATH: ${.CURDIR}/../../../../netinet ${.CURDIR}/../../../../netinet6
# INET
SRCS+= in_proto.c igmp.c in.c in_offload.c in_pcb.c ip_icmp.c \
ip_flow.c ip_id.c ip_input.c ip_output.c raw_ip.c in_cksum.c \
cpu_in_cksum.c in4_cksum.c ip_encap.c
ip_flow.c ip_id.c ip_input.c ip_reass.c ip_output.c raw_ip.c \
in_cksum.c cpu_in_cksum.c in4_cksum.c ip_encap.c
# INET6
SRCS+= dest6.c frag6.c icmp6.c in6.c in6_cksum.c in6_ifattach.c \