Work in progress: use a raw socket for GRE in IP encapsulation

instead of adding/subtracting our own IPv4 header.

There are many benefits:  gre(4) needn't grok the outer encapsulation
header any longer, so this simplifies the gre(4) code.  The IP
stack needn't grok GRE, so it is simplified, too.  gre(4) will
benefit from optimizations in the socket code.  Eventually, gre(4)
will gain an IPv6 encapsulation with very few new lines of code.

There is a small performance loss.  A 133 MHz, 486-class AMD Elan
sinks/sources a TCP stream over GRE with about 93% of the throughput
of the old code.  TCP throughput on a 266 MHz, 586-class AMD Geode
is about 96% of the throughput of the old code.  A 175 MHz ADM5120
(MIPS) only sinks a TCP stream over GRE at about 90% of the
throughput of the old code; I am still investigating that.

I produced stripped-down versions of sosend() and soreceive() for
gre(4) to use.  They are guaranteed not to block, so they can be
called from a software interrupt and from a socket upcall,
respectively.

A kernel thread is no longer necessary for socket transmit/receive,
but I have not gotten around to removing it yet.

Thanks to Matt Thomas for suggesting the use of stripped-down socket
code and software interrupts, and to Andrew Doran for advice and
answers concerning software interrupts, threads, and performance.
This commit is contained in:
dyoung 2007-10-05 03:28:12 +00:00
parent c3d21ddcd4
commit 60149b1ce8
7 changed files with 1031 additions and 1014 deletions

View File

@ -1,4 +1,4 @@
# $NetBSD: mi,v 1.1069 2007/09/30 13:10:52 kiyohara Exp $
# $NetBSD: mi,v 1.1070 2007/10/05 03:28:13 dyoung Exp $
./etc/mtree/set.comp comp-sys-root
./usr/bin/addr2line comp-debug-bin bfd
./usr/bin/ar comp-util-bin bfd
@ -1400,7 +1400,7 @@
./usr/include/netinet/ip_encap.h comp-c-include
./usr/include/netinet/ip_fil.h comp-ipf-include ipfilter
./usr/include/netinet/ip_frag.h comp-ipf-include ipfilter
./usr/include/netinet/ip_gre.h comp-c-include
./usr/include/netinet/ip_gre.h comp-obsolete obsolete
./usr/include/netinet/ip_htable.h comp-ipf-include ipfilter
./usr/include/netinet/ip_icmp.h comp-c-include
./usr/include/netinet/ip_lookup.h comp-ipf-include ipfilter

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,4 @@
/* $NetBSD: if_gre.h,v 1.26 2007/09/02 01:49:49 dyoung Exp $ */
/* $NetBSD: if_gre.h,v 1.27 2007/10/05 03:28:12 dyoung Exp $ */
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
@ -42,44 +42,75 @@
#include <sys/queue.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/malloc.h>
#include <sys/mallocvar.h>
#ifdef _KERNEL
struct gre_soparm {
struct in_addr sp_src; /* source address of gre packets */
struct in_addr sp_dst; /* destination address of gre packets */
in_port_t sp_srcport; /* source port of gre packets */
in_port_t sp_dstport; /* destination port of gre packets */
struct file *sp_fp;
struct sockaddr_storage sp_src; /* source of gre packets */
struct sockaddr_storage sp_dst; /* destination of gre packets */
int sp_type; /* encapsulating socket type */
int sp_proto; /* encapsulating protocol */
int sp_fd;
int sp_bysock; /* encapsulation configured by passing
* socket, not by SIOCSLIFPHYADDR
*/
};
/*
 * Values stored in the sc_state member of struct gre_softc.
 * The numbering is sequential, starting at zero.
 */
enum gre_state {
	GRE_S_IDLE = 0,
	GRE_S_CONF = 1,
	GRE_S_WORK = 2,
	GRE_S_KCONF = 3,
	GRE_S_DIE = 4,
	GRE_S_DEAD = 5
};
#define __cacheline_aligned __attribute__((__aligned__(CACHE_LINE_SIZE)))
/*
 * Queue of mbuf pointers; struct gre_softc uses one as its sc_snd
 * member.
 *
 * NOTE(review): the field names suggest a producer/consumer ring whose
 * length is a power of two (hence bq_lenmask) -- confirm against the
 * queue routines in if_gre.c before relying on that.
 */
struct gre_bufq {
volatile int bq_prodidx; /* presumably the producer's index into bq_buf */
volatile int bq_considx; /* presumably the consumer's index into bq_buf */
size_t bq_len __cacheline_aligned; /* starts on its own cache line */
size_t bq_lenmask; /* presumably bq_len - 1 -- TODO confirm */
volatile int bq_drops; /* presumably counts mbufs dropped -- TODO confirm */
struct mbuf **bq_buf; /* array of mbuf pointers */
};
MALLOC_DECLARE(M_GRE_BUFQ);
struct gre_softc {
struct ifnet sc_if;
kmutex_t sc_mtx;
kcondvar_t sc_soparm_cv;
kcondvar_t sc_join_cv;
kcondvar_t sc_work_cv;
int sc_haswork;
int sc_running;
int sc_dying;
struct ifqueue sc_snd;
kcondvar_t sc_condvar;
struct gre_bufq sc_snd;
struct gre_soparm sc_soparm;
struct gre_soparm sc_newsoparm;
LIST_ENTRY(gre_softc) sc_list;
struct route route; /* routing entry that determines where a
encapsulated packet should go */
int sc_proto; /* protocol of encapsulator */
struct uio sc_uio;
struct lwp *sc_lwp;
volatile enum gre_state sc_state;
volatile int sc_waiters;
volatile int sc_upcalls;
void *sc_si;
struct socket *sc_so;
struct evcnt sc_recv_ev;
struct evcnt sc_send_ev;
struct evcnt sc_wakeup_ev;
struct evcnt sc_block_ev;
struct evcnt sc_error_ev;
struct evcnt sc_pullup_ev;
struct evcnt sc_unsupp_ev;
struct evcnt sc_supcall_ev;
struct evcnt sc_rupcall_ev;
struct evcnt sc_oflow_ev;
};
#define sc_src sc_newsoparm.sp_src
#define sc_srcport sc_newsoparm.sp_srcport
#define sc_dst sc_newsoparm.sp_dst
#define sc_dstport sc_newsoparm.sp_dstport
struct gre_h {
u_int16_t flags; /* GRE flags */
u_int16_t ptype; /* protocol type of payload typically
Ether protocol type*/
uint16_t flags; /* GRE flags */
uint16_t ptype; /* protocol type of payload typically
* ethernet protocol type
*/
/*
* from here on: fields are optional, presence indicated by flags
*
@ -101,18 +132,6 @@ struct gre_h {
*/
} __attribute__((__packed__));
/*
 * An IP header (gi_i) with a GRE header (gi_g) packed directly behind
 * it.  The gi_* macros that follow give shorthand access to members of
 * both headers.
 */
struct greip {
struct ip gi_i;
struct gre_h gi_g;
} __attribute__((__packed__));
#define gi_pr gi_i.ip_p
#define gi_len gi_i.ip_len
#define gi_src gi_i.ip_src
#define gi_dst gi_i.ip_dst
#define gi_ptype gi_g.ptype
#define gi_flags gi_g.flags
#define GRE_CP 0x8000 /* Checksum Present */
#define GRE_RP 0x4000 /* Routing Present */
#define GRE_KP 0x2000 /* Key Present */
@ -131,25 +150,6 @@ struct gre_sre {
u_char *sre_rtinfo; /* the routing information */
};
/*
 * for mobile encaps
 *
 * Packed header used by the mobile encapsulation.  The trailing osrc
 * word is only present when the S-bit is set in proto; MOB_H_SIZ_S and
 * MOB_H_SIZ_L give the header size without and with it.
 */
struct mobile_h {
	uint16_t proto;		/* protocol and S-bit */
	uint16_t hcrc;		/* header checksum */
	uint32_t odst;		/* original destination address */
	uint32_t osrc;		/* original source addr, if S-bit set */
} __attribute__((__packed__));
/*
 * An IP header (mi) with the mobile encapsulation header (mh) packed
 * directly behind it.
 */
struct mobip_h {
struct ip mi;
struct mobile_h mh;
} __attribute__((__packed__));
#define MOB_H_SIZ_S (sizeof(struct mobile_h) - sizeof(u_int32_t))
#define MOB_H_SIZ_L (sizeof(struct mobile_h))
#define MOB_H_SBIT 0x0080
#define GRE_TTL 30
extern int ip_gre_ttl;
#endif /* _KERNEL */
@ -167,13 +167,4 @@ extern int ip_gre_ttl;
#define GRESSOCK _IOW('i' , 107, struct ifreq)
#define GREDSOCK _IOW('i' , 108, struct ifreq)
#ifdef _KERNEL
LIST_HEAD(gre_softc_head, gre_softc);
extern struct gre_softc_head gre_softc_list;
u_int16_t gre_in_cksum(u_short *, u_int);
int gre_input3(struct gre_softc *, struct mbuf *, int, const struct gre_h *,
int);
#endif /* _KERNEL */
#endif /* !_NET_IF_GRE_H_ */

View File

@ -1,11 +1,11 @@
# $NetBSD: Makefile,v 1.18 2007/05/02 22:39:03 dyoung Exp $
# $NetBSD: Makefile,v 1.19 2007/10/05 03:28:13 dyoung Exp $
INCSDIR= /usr/include/netinet
INCS= icmp6.h icmp_var.h if_atm.h if_ether.h if_inarp.h igmp.h \
igmp_var.h in.h in_gif.h in_pcb.h in_pcb_hdr.h \
in_selsrc.h in_systm.h \
in_var.h ip.h ip_carp.h ip6.h ip_ecn.h ip_encap.h ip_gre.h \
in_var.h ip.h ip_carp.h ip6.h ip_ecn.h ip_encap.h \
ip_icmp.h ip_mroute.h ip_var.h pim.h pim_var.h \
tcp.h tcp_debug.h tcp_fsm.h tcp_seq.h tcp_timer.h tcp_var.h \
tcpip.h udp.h udp_var.h

View File

@ -1,4 +1,4 @@
/* $NetBSD: in_proto.c,v 1.90 2007/09/19 18:52:55 dyoung Exp $ */
/* $NetBSD: in_proto.c,v 1.91 2007/10/05 03:28:13 dyoung Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
@ -61,7 +61,7 @@
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: in_proto.c,v 1.90 2007/09/19 18:52:55 dyoung Exp $");
__KERNEL_RCSID(0, "$NetBSD: in_proto.c,v 1.91 2007/10/05 03:28:13 dyoung Exp $");
#include "opt_mrouting.h"
#include "opt_eon.h" /* ISO CLNL over IP */
@ -138,11 +138,6 @@ __KERNEL_RCSID(0, "$NetBSD: in_proto.c,v 1.90 2007/09/19 18:52:55 dyoung Exp $")
#include <netiso/eonvar.h>
#endif /* EON */
#include "gre.h"
#if NGRE > 0
#include <netinet/ip_gre.h>
#endif
#include "carp.h"
#if NCARP > 0
#include <netinet/ip_carp.h>
@ -304,28 +299,6 @@ const struct protosw inetsw[] = {
.pr_usrreq = rip_usrreq,
},
#endif /* NCARP > 0 */
#if NGRE > 0
{ .pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_GRE,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = gre_input,
.pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_ctlinput = rip_ctlinput,
.pr_usrreq = rip_usrreq,
},
{ .pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_MOBILE,
.pr_flags = PR_ATOMIC|PR_ADDR|PR_LASTHDR,
.pr_input = gre_mobile_input,
.pr_output = rip_output,
.pr_ctloutput = rip_ctloutput,
.pr_ctlinput = rip_ctlinput,
.pr_usrreq = rip_usrreq,
},
#endif /* NGRE > 0 */
{ .pr_type = SOCK_RAW,
.pr_domain = &inetdomain,
.pr_protocol = IPPROTO_IGMP,

View File

@ -1,270 +0,0 @@
/* $NetBSD: ip_gre.c,v 1.47 2007/09/02 01:49:49 dyoung Exp $ */
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Heiko W.Rupp <hwr@pilhuhn.de>
*
* IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
/*
* deencapsulate tunneled packets and send them on
* output half is in net/if_gre.[ch]
* This currently handles IPPROTO_GRE, IPPROTO_MOBILE
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: ip_gre.c,v 1.47 2007/09/02 01:49:49 dyoung Exp $");
#include "gre.h"
#if NGRE > 0
#include "opt_inet.h"
#include "opt_atalk.h"
#include "bpfilter.h"
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/ioctl.h>
#include <sys/syslog.h>
#include <net/bpf.h>
#include <net/ethertypes.h>
#include <net/if.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/raw_cb.h>
#ifdef INET
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/ip_var.h>
#include <netinet/ip_gre.h>
#else
#error ip_gre input without IP?
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#include <netatalk/at_extern.h>
#endif
/* Needs IP headers. */
#include <net/if_gre.h>
#include <machine/stdarg.h>
#if 1
void gre_inet_ntoa(struct in_addr in); /* XXX */
#endif
struct gre_softc *gre_lookup(struct mbuf *, u_int8_t);
int gre_input2(struct mbuf *, int, u_char);
/*
 * Protocol input routine for IPPROTO_GRE: IP calls this for every GRE
 * packet addressed to a local destination.
 *
 * The two variadic arguments are ints: the offset handed on to
 * gre_input2() and the protocol number.  The packet is first offered
 * to gre_input2(); when that reports 0 (not processed, i.e. no
 * matching tunnel that is up), the packet is passed to rip_input() so
 * that a raw IP socket may still pick it up.
 */
void
gre_input(struct mbuf *m, ...)
{
	va_list args;
	int hdrlen, ipproto;

	va_start(args, m);
	hdrlen = va_arg(args, int);
	ipproto = va_arg(args, int);
	va_end(args);

	/* Non-zero: the tunnel code has dealt with the packet. */
	if (gre_input2(m, hdrlen, ipproto) != 0)
		return;

	rip_input(m, hdrlen, ipproto);
}
/*
 * decapsulate.
 * Does the real work and is called from gre_input() (above).
 *
 * m     - the received packet
 * hlen  - passed through unchanged to gre_input3()
 * proto - the protocol number of the "calling" foo_input() routine,
 *         used to select the tunnel
 *
 * Returns 0 if the packet is not yet processed (no matching tunnel,
 * or the tunnel is down) and 1 if it needs no further processing.
 * When a tunnel matches and the headers are available, the result of
 * gre_input3() is returned as is.
 *
 * The GRE header is located through struct greip, i.e. directly
 * behind a struct ip; hlen is not used to find it.
 * NOTE(review): that looks wrong for packets carrying IP options --
 * confirm.
 * NOTE(review): m_pullup() may return a different mbuf than the one
 * the caller holds; confirm gre_input3() never returns 0 after a
 * pullup, otherwise gre_input() hands a stale pointer to rip_input().
 */
int
gre_input2(struct mbuf *m, int hlen, u_char proto)
{
	const struct greip *gip;
	struct gre_softc *sc;

	if ((sc = gre_lookup(m, proto)) == NULL) {
		/* No matching tunnel or tunnel is down. */
		return 0;
	}

	if (m->m_len < sizeof(*gip)) {
		m = m_pullup(m, sizeof(*gip));
		if (m == NULL) {
			/*
			 * m_pullup() has already freed the chain, so
			 * nothing is left for the caller to process.
			 * Report that with the documented value 1 rather
			 * than an errno.
			 */
			return 1;
		}
	}

	gip = mtod(m, const struct greip *);
	return gre_input3(sc, m, hlen, &gip->gi_g, 0);
}
/*
 * input routine for IPPROTO_MOBILE
 * This is a little bit different from the other modes, as the
 * encapsulating header was not prepended, but instead inserted
 * between IP header and payload
 *
 * Only the first variadic argument (the IP header length, an int) is
 * consumed.  The routine restores the original addresses and protocol
 * in the IP header from the mobile header, verifies the mobile header
 * checksum, cuts the mobile header out of the packet, recomputes the
 * IP checksum and queues the result on ipintrq.  The mbuf is always
 * consumed.
 */
void
gre_mobile_input(struct mbuf *m, ...)
{
struct ip *ip;
struct mobip_h *mip;
struct ifqueue *ifq;
struct gre_softc *sc;
uint8_t *hdr;
int hlen, s;
va_list ap;
int msiz;
va_start(ap, m);
hlen = va_arg(ap, int);
va_end(ap);
if ((sc = gre_lookup(m, IPPROTO_MOBILE)) == NULL) {
/* No matching tunnel or tunnel is down. */
m_freem(m);
return;
}
/* The IP and mobile headers are modified in place below. */
if (M_UNWRITABLE(m, sizeof(*mip))) {
m = m_pullup(m, sizeof(*mip));
/* On failure m_pullup() has already freed the chain (mbuf(9)). */
if (m == NULL)
return;
}
ip = mtod(m, struct ip *);
/* XXX what if there are IP options? */
/* mip overlays the same bytes as ip; mh is assumed to follow a plain struct ip. */
mip = mtod(m, struct mobip_h *);
/* Note: the counters are bumped before the checksum below is verified. */
sc->sc_if.if_ipackets++;
sc->sc_if.if_ibytes += m->m_pkthdr.len;
/* S-bit set: long header form, which also carries the original source. */
if (ntohs(mip->mh.proto) & MOB_H_SBIT) {
msiz = MOB_H_SIZ_L;
mip->mi.ip_src.s_addr = mip->mh.osrc;
} else
msiz = MOB_H_SIZ_S;
if (M_UNWRITABLE(m, hlen + msiz)) {
m = m_pullup(m, hlen + msiz);
if (m == NULL)
return;
/* m may have changed: re-derive both header pointers. */
ip = mtod(m, struct ip *);
mip = mtod(m, struct mobip_h *);
}
mip->mi.ip_dst.s_addr = mip->mh.odst;
/* The original protocol number is the upper byte of the host-order proto field. */
mip->mi.ip_p = (ntohs(mip->mh.proto) >> 8);
/* The checksum covers only the mobile header, which was not modified above. */
if (gre_in_cksum((u_int16_t *)&mip->mh, msiz) != 0) {
m_freem(m);
return;
}
/* Slide the rest of the first mbuf down over the mobile header. */
hdr = mtod(m, uint8_t *);
memmove(hdr + hlen, hdr + hlen + msiz, m->m_len - msiz - hlen);
m->m_len -= msiz;
ip->ip_len = htons(ntohs(ip->ip_len) - msiz);
m->m_pkthdr.len -= msiz;
/* The IP header changed, so its checksum is recomputed from scratch. */
ip->ip_sum = 0;
ip->ip_sum = in_cksum(m, hlen);
#if NBPFILTER > 0
if (sc->sc_if.if_bpf != NULL)
bpf_mtap_af(sc->sc_if.if_bpf, AF_INET, m);
#endif /*NBPFILTER > 0*/
/* Queue the packet on ipintrq, or drop it if the queue is full. */
ifq = &ipintrq;
s = splnet(); /* possible */
if (IF_QFULL(ifq)) {
IF_DROP(ifq);
m_freem(m);
} else
IF_ENQUEUE(ifq, m);
splx(s);
}
/*
 * Find the gre interface associated with our src/dst/proto set.
 *
 * Walks gre_softc_list and returns the first softc whose configured
 * destination equals the packet's IP source, whose configured source
 * equals the packet's IP destination, whose sc_proto equals `proto',
 * and whose interface is marked IFF_UP.  Returns NULL if there is no
 * such softc.  The mbuf is expected to start with an IP header.
 */
struct gre_softc *
gre_lookup(struct mbuf *m, u_int8_t proto)
{
	const struct ip *iph = mtod(m, const struct ip *);
	struct gre_softc *cand;

	LIST_FOREACH(cand, &gre_softc_list, sc_list) {
		/* The tunnel's remote end must be the packet's sender... */
		if (cand->sc_dst.s_addr != iph->ip_src.s_addr)
			continue;
		/* ...and its local end the packet's destination. */
		if (cand->sc_src.s_addr != iph->ip_dst.s_addr)
			continue;
		if (cand->sc_proto != proto)
			continue;
		if ((cand->sc_if.if_flags & IFF_UP) == 0)
			continue;
		return cand;
	}

	return NULL;
}
#endif /* if NGRE > 0 */

View File

@ -1,47 +0,0 @@
/* $NetBSD: ip_gre.h,v 1.8 2005/12/10 23:36:23 elad Exp $ */
/*
* Copyright (c) 1998 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Heiko W.Rupp <hwr@pilhuhn.de>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
* This product includes software developed by the NetBSD
* Foundation, Inc. and its contributors.
* 4. Neither the name of The NetBSD Foundation nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef _NETINET_IP_GRE_H_
#define _NETINET_IP_GRE_H_
#ifdef _KERNEL
void gre_input(struct mbuf *, ...);
void gre_mobile_input(struct mbuf *, ...);
#endif /* _KERNEL */
#endif /* !_NETINET_IP_GRE_H_ */