NetBSD/sys/net/if_gre.c
2010-06-26 14:24:27 +00:00

1564 lines
35 KiB
C

/* $NetBSD: if_gre.c,v 1.144 2010/06/26 14:24:28 kefren Exp $ */
/*
* Copyright (c) 1998, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Heiko W.Rupp <hwr@pilhuhn.de>
*
* IPv6-over-GRE contributed by Gert Doering <gert@greenie.muc.de>
*
* GRE over UDP/IPv4/IPv6 sockets contributed by David Young <dyoung@NetBSD.org>
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
* This material is based upon work partially supported by NSF
* under Contract No. NSF CNS-0626584.
*/
/*
* Encapsulate L3 protocols into IP
* See RFC 1701 and 1702 for more details.
* If_gre is compatible with Cisco GRE tunnels, so you can
* have a NetBSD box as the other end of a tunnel interface of a Cisco
* router. See gre(4) for more details.
*/
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: if_gre.c,v 1.144 2010/06/26 14:24:28 kefren Exp $");
#include "opt_atalk.h"
#include "opt_gre.h"
#include "opt_inet.h"
#include "opt_mpls.h"
#include <sys/param.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/malloc.h>
#include <sys/mallocvar.h>
#include <sys/mbuf.h>
#include <sys/proc.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/ioctl.h>
#include <sys/queue.h>
#include <sys/intr.h>
#include <sys/systm.h>
#include <sys/sysctl.h>
#include <sys/kauth.h>
#include <sys/kernel.h>
#include <sys/mutex.h>
#include <sys/condvar.h>
#include <sys/kthread.h>
#include <sys/cpu.h>
#include <net/ethertypes.h>
#include <net/if.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <netinet/in_systm.h>
#include <netinet/in.h>
#include <netinet/ip.h> /* we always need this for sizeof(struct ip) */
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/ip_var.h>
#endif
#ifdef INET6
#include <netinet6/in6_var.h>
#endif
#ifdef MPLS
#include <netmpls/mpls.h>
#include <netmpls/mpls_var.h>
#endif
#ifdef NETATALK
#include <netatalk/at.h>
#include <netatalk/at_var.h>
#include <netatalk/at_extern.h>
#endif
#include <sys/time.h>
#include <net/bpf.h>
#include <net/if_gre.h>
#include <compat/sys/socket.h>
#include <compat/sys/sockio.h>
/*
* It is not easy to calculate the right value for a GRE MTU.
* We leave this task to the admin and use the same default that
* other vendors use.
*/
#define GREMTU 1476
/*
 * Debug tracing.
 *
 * With GRE_DEBUG defined, GRE_DPRINTF() prints "<function>.<line>: "
 * followed by the printf-style message whenever the global gre_debug is
 * non-zero or the interface embedded in the softc has IFF_DEBUG set.
 * Without GRE_DEBUG the macro expands to an empty statement and its
 * arguments are not evaluated.
 */
#ifdef GRE_DEBUG
int gre_debug = 0;
#define GRE_DPRINTF(__sc, ...) \
do { \
if (__predict_false(gre_debug || \
((__sc)->sc_if.if_flags & IFF_DEBUG) != 0)) { \
printf("%s.%d: ", __func__, __LINE__); \
printf(__VA_ARGS__); \
} \
} while (/*CONSTCOND*/0)
#else
#define GRE_DPRINTF(__sc, __fmt, ...) do { } while (/*CONSTCOND*/0)
#endif /* GRE_DEBUG */
int ip_gre_ttl = GRE_TTL;
MALLOC_DEFINE(M_GRE_BUFQ, "gre_bufq", "gre mbuf queue");
static int gre_clone_create(struct if_clone *, int);
static int gre_clone_destroy(struct ifnet *);
static struct if_clone gre_cloner =
IF_CLONE_INITIALIZER("gre", gre_clone_create, gre_clone_destroy);
static int gre_input(struct gre_softc *, struct mbuf *, int,
const struct gre_h *);
static bool gre_is_nullconf(const struct gre_soparm *);
static int gre_output(struct ifnet *, struct mbuf *,
const struct sockaddr *, struct rtentry *);
static int gre_ioctl(struct ifnet *, u_long, void *);
static int gre_getsockname(struct socket *, struct mbuf *, struct lwp *);
static int gre_getpeername(struct socket *, struct mbuf *, struct lwp *);
static int gre_getnames(struct socket *, struct lwp *,
struct sockaddr_storage *, struct sockaddr_storage *);
static void gre_clearconf(struct gre_soparm *, bool);
static int gre_soreceive(struct socket *, struct mbuf **);
static int gre_sosend(struct socket *, struct mbuf *);
static struct socket *gre_reconf(struct gre_softc *, const struct gre_soparm *);
static bool gre_fp_send(struct gre_softc *, enum gre_msg, file_t *);
static bool gre_fp_recv(struct gre_softc *);
static void gre_fp_recvloop(void *);
/*
 * Round len0 to a nearby power of two.
 *
 * Zero maps to 1.  Otherwise the most significant set bit of len0 is
 * isolated; the result is that bit doubled when len0 is at least
 * (bit | bit >> 1), and the bit itself when len0 is smaller or when
 * doubling would overflow.
 */
static int
nearest_pow2(size_t len0)
{
	size_t hibit, threshold;

	if (len0 == 0)
		return 1;

	/* Strip low-order set bits until a single bit remains. */
	hibit = len0;
	while ((hibit & (hibit - 1)) != 0)
		hibit &= hibit - 1;

	/* Doubling would wrap around: stay at the top bit. */
	if ((hibit << 1) < hibit)
		return hibit;

	threshold = hibit | (hibit >> 1);
	return (len0 >= threshold) ? hibit << 1 : hibit;
}
/*
 * Initialise the mbuf ring *bq.  The requested length len0 is rounded by
 * nearest_pow2() so that indices can be wrapped with bq_lenmask.  The
 * slot array is allocated with M_WAITOK.  Returns bq.
 */
static struct gre_bufq *
gre_bufq_init(struct gre_bufq *bq, size_t len0)
{
	size_t nslots = nearest_pow2(len0);

	memset(bq, 0, sizeof(*bq));
	bq->bq_len = nslots;
	bq->bq_lenmask = nslots - 1;
	bq->bq_buf = malloc(nslots * sizeof(struct mbuf *), M_GRE_BUFQ,
	    M_WAITOK);

	return bq;
}
static bool
gre_bufq_empty(struct gre_bufq *bq)
{
return bq->bq_prodidx == bq->bq_considx;
}
/*
 * Remove and return the mbuf at the consumer index, or NULL if the ring
 * is empty.  The consumer index wraps using bq_lenmask.
 */
static struct mbuf *
gre_bufq_dequeue(struct gre_bufq *bq)
{
	struct mbuf *head = NULL;

	if (!gre_bufq_empty(bq)) {
		head = bq->bq_buf[bq->bq_considx];
		bq->bq_considx = (bq->bq_considx + 1) & bq->bq_lenmask;
	}
	return head;
}
/* Drain the ring, freeing every queued mbuf chain. */
static void
gre_bufq_purge(struct gre_bufq *bq)
{
	struct mbuf *pkt;

	for (pkt = gre_bufq_dequeue(bq); pkt != NULL;
	    pkt = gre_bufq_dequeue(bq))
		m_freem(pkt);
}
/*
 * Append m at the producer index.  One slot is always kept free: if
 * advancing the producer would make it equal to the consumer, the packet
 * is refused, bq_drops is bumped and ENOBUFS is returned (the caller
 * keeps ownership of m).  Returns 0 on success.
 */
static int
gre_bufq_enqueue(struct gre_bufq *bq, struct mbuf *m)
{
	int slot_after;

	slot_after = (bq->bq_prodidx + 1) & bq->bq_lenmask;
	if (slot_after != bq->bq_considx) {
		bq->bq_buf[bq->bq_prodidx] = m;
		bq->bq_prodidx = slot_after;
		return 0;
	}

	bq->bq_drops++;
	return ENOBUFS;
}
/*
 * Soft interrupt handler (registered by gre_reconf()): hands every packet
 * waiting in sc_snd to gre_sosend() on the configured socket.  Send
 * failures are only reported through GRE_DPRINTF().
 */
static void
greintr(void *arg)
{
	struct gre_softc *sc = arg;
	struct socket *so = sc->sc_soparm.sp_so;
	struct mbuf *pkt;
	int rc;

	KASSERT(so != NULL);

	sc->sc_send_ev.ev_count++;
	GRE_DPRINTF(sc, "enter\n");

	for (;;) {
		pkt = gre_bufq_dequeue(&sc->sc_snd);
		if (pkt == NULL)
			break;
		/* XXX handle ENOBUFS? */
		rc = gre_sosend(so, pkt);
		if (rc != 0)
			GRE_DPRINTF(sc, "gre_sosend failed %d\n", rc);
	}
}
/*
 * Sleep on sc_condvar, counting ourselves in sc_waiters for the duration.
 * The caller must already hold sc->sc_mtx.
 */
static void
gre_wait(struct gre_softc *sc)
{
	sc->sc_waiters += 1;
	cv_wait(&sc->sc_condvar, &sc->sc_mtx);
	sc->sc_waiters -= 1;
}
/*
 * Sleep on sc_fp_condvar (paired with sc_mtx), counting ourselves in
 * sc_fp_waiters for the duration.
 */
static void
gre_fp_wait(struct gre_softc *sc)
{
	sc->sc_fp_waiters += 1;
	cv_wait(&sc->sc_fp_condvar, &sc->sc_mtx);
	sc->sc_fp_waiters -= 1;
}
static void
gre_evcnt_detach(struct gre_softc *sc)
{
evcnt_detach(&sc->sc_unsupp_ev);
evcnt_detach(&sc->sc_pullup_ev);
evcnt_detach(&sc->sc_error_ev);
evcnt_detach(&sc->sc_block_ev);
evcnt_detach(&sc->sc_recv_ev);
evcnt_detach(&sc->sc_oflow_ev);
evcnt_detach(&sc->sc_send_ev);
}
/*
 * Attach the per-interface event counters, grouped under the interface
 * name.  "recv" is the parent of the receive-side counters and "send" is
 * the parent of "overflow".
 */
static void
gre_evcnt_attach(struct gre_softc *sc)
{
	const char *ifname = sc->sc_if.if_xname;
	struct evcnt *rx = &sc->sc_recv_ev;
	struct evcnt *tx = &sc->sc_send_ev;

	/* receive side */
	evcnt_attach_dynamic(rx, EVCNT_TYPE_MISC, NULL, ifname, "recv");
	evcnt_attach_dynamic(&sc->sc_block_ev, EVCNT_TYPE_MISC, rx, ifname,
	    "would block");
	evcnt_attach_dynamic(&sc->sc_error_ev, EVCNT_TYPE_MISC, rx, ifname,
	    "error");
	evcnt_attach_dynamic(&sc->sc_pullup_ev, EVCNT_TYPE_MISC, rx, ifname,
	    "pullup failed");
	evcnt_attach_dynamic(&sc->sc_unsupp_ev, EVCNT_TYPE_MISC, rx, ifname,
	    "unsupported");

	/* transmit side */
	evcnt_attach_dynamic(tx, EVCNT_TYPE_MISC, NULL, ifname, "send");
	evcnt_attach_dynamic(&sc->sc_oflow_ev, EVCNT_TYPE_MISC, tx, ifname,
	    "overflow");
}
/*
 * Cloner "create" hook: allocate and attach a new gre interface.
 *
 * The softc starts out with wildcard source and destination addresses
 * (IPv4 if available, otherwise IPv6), protocol IPPROTO_GRE over a raw
 * socket, no file descriptor (sc_fd == -1) and IFF_LINK0 set.  A helper
 * kernel thread running gre_fp_recvloop() is started for the interface.
 *
 * Returns 0 on success and -1 on failure.  On failure everything that
 * was set up so far is released again.
 */
static int
gre_clone_create(struct if_clone *ifc, int unit)
{
	int rc;
	struct gre_softc *sc;
	struct gre_soparm *sp;
	const struct sockaddr *any;

	if ((any = sockaddr_any_by_family(AF_INET)) == NULL &&
	    (any = sockaddr_any_by_family(AF_INET6)) == NULL)
		return -1;

	sc = malloc(sizeof(struct gre_softc), M_DEVBUF, M_WAITOK|M_ZERO);
	mutex_init(&sc->sc_mtx, MUTEX_DRIVER, IPL_SOFTNET);
	cv_init(&sc->sc_condvar, "gre wait");
	cv_init(&sc->sc_fp_condvar, "gre fp");

	if_initname(&sc->sc_if, ifc->ifc_name, unit);
	sc->sc_if.if_softc = sc;
	sc->sc_if.if_type = IFT_TUNNEL;
	sc->sc_if.if_addrlen = 0;
	sc->sc_if.if_hdrlen = sizeof(struct ip) + sizeof(struct gre_h);
	sc->sc_if.if_dlt = DLT_NULL;
	sc->sc_if.if_mtu = GREMTU;
	sc->sc_if.if_flags = IFF_POINTOPOINT|IFF_MULTICAST;
	sc->sc_if.if_output = gre_output;
	sc->sc_if.if_ioctl = gre_ioctl;

	sp = &sc->sc_soparm;
	sockaddr_copy(sstosa(&sp->sp_dst), sizeof(sp->sp_dst), any);
	sockaddr_copy(sstosa(&sp->sp_src), sizeof(sp->sp_src), any);
	sp->sp_proto = IPPROTO_GRE;
	sp->sp_type = SOCK_RAW;

	sc->sc_fd = -1;

	/*
	 * The thread name is a printf-style format: pass the interface
	 * name as an argument rather than as the format itself.
	 */
	rc = kthread_create(PRI_NONE, KTHREAD_MPSAFE, NULL, gre_fp_recvloop, sc,
	    NULL, "%s", sc->sc_if.if_xname);
	if (rc != 0)
		goto fail;

	gre_evcnt_attach(sc);

	gre_bufq_init(&sc->sc_snd, 17);
	sc->sc_if.if_flags |= IFF_LINK0;
	if_attach(&sc->sc_if);
	if_alloc_sadl(&sc->sc_if);
	bpf_attach(&sc->sc_if, DLT_NULL, sizeof(uint32_t));
	sc->sc_state = GRE_S_IDLE;
	return 0;

fail:
	/* Nothing has been attached yet; undo the initialisation above. */
	cv_destroy(&sc->sc_fp_condvar);
	cv_destroy(&sc->sc_condvar);
	mutex_destroy(&sc->sc_mtx);
	free(sc, M_DEVBUF);
	return -1;
}
static int
gre_clone_destroy(struct ifnet *ifp)
{
int s;
struct gre_softc *sc = ifp->if_softc;
GRE_DPRINTF(sc, "\n");
bpf_detach(ifp);
s = splnet();
if_detach(ifp);
/* Some LWPs may still wait in gre_ioctl_lock(), however,
* no new LWP will enter gre_ioctl_lock(), because ifunit()
* cannot locate the interface any longer.
*/
mutex_enter(&sc->sc_mtx);
GRE_DPRINTF(sc, "\n");
while (sc->sc_state != GRE_S_IDLE)
gre_wait(sc);
GRE_DPRINTF(sc, "\n");
sc->sc_state = GRE_S_DIE;
cv_broadcast(&sc->sc_condvar);
while (sc->sc_waiters > 0)
cv_wait(&sc->sc_condvar, &sc->sc_mtx);
/* At this point, no other LWP will access the gre_softc, so
* we can release the mutex.
*/
mutex_exit(&sc->sc_mtx);
GRE_DPRINTF(sc, "\n");
/* Note that we must not hold the mutex while we call gre_reconf(). */
gre_reconf(sc, NULL);
mutex_enter(&sc->sc_mtx);
sc->sc_msg = GRE_M_STOP;
cv_signal(&sc->sc_fp_condvar);
while (sc->sc_fp_waiters > 0)
cv_wait(&sc->sc_fp_condvar, &sc->sc_mtx);
mutex_exit(&sc->sc_mtx);
splx(s);
cv_destroy(&sc->sc_condvar);
cv_destroy(&sc->sc_fp_condvar);
mutex_destroy(&sc->sc_mtx);
gre_evcnt_detach(sc);
free(sc, M_DEVBUF);
return 0;
}
/*
 * Socket receive upcall (installed by gre_upcall_add()).  Pulls one
 * record off the socket with gre_soreceive(), makes sure the GRE header
 * is contiguous, and feeds the packet to gre_input().  Packets that
 * gre_input() declines (return value 0) are freed here.
 */
static void
gre_receive(struct socket *so, void *arg, int events, int waitflag)
{
	struct gre_softc *sc = arg;
	struct mbuf *pkt;
	const struct gre_h *hdr;
	int err;

	GRE_DPRINTF(sc, "enter\n");

	sc->sc_recv_ev.ev_count++;

	err = gre_soreceive(so, &pkt);
	/* TBD Back off if ECONNREFUSED (indicates
	 * ICMP Port Unreachable)?
	 */
	if (err == EWOULDBLOCK) {
		GRE_DPRINTF(sc, "EWOULDBLOCK\n");
		sc->sc_block_ev.ev_count++;
		return;
	}
	if (err != 0 || pkt == NULL) {
		GRE_DPRINTF(sc, "%s: rc %d m %p\n",
		    sc->sc_if.if_xname, err, (void *)pkt);
		sc->sc_error_ev.ev_count++;
		return;
	}

	if (pkt->m_len < sizeof(*hdr)) {
		pkt = m_pullup(pkt, sizeof(*hdr));
		if (pkt == NULL) {
			GRE_DPRINTF(sc, "m_pullup failed\n");
			sc->sc_pullup_ev.ev_count++;
			return;
		}
	}
	hdr = mtod(pkt, const struct gre_h *);

	if (gre_input(sc, pkt, 0, hdr) != 0)
		return;

	/* gre_input() did not take the packet. */
	sc->sc_unsupp_ev.ev_count++;
	GRE_DPRINTF(sc, "dropping unsupported\n");
	m_freem(pkt);
}
static void
gre_upcall_add(struct socket *so, void *arg)
{
/* XXX What if the kernel already set an upcall? */
KASSERT((so->so_rcv.sb_flags & SB_UPCALL) == 0);
so->so_upcallarg = arg;
so->so_upcall = gre_receive;
so->so_rcv.sb_flags |= SB_UPCALL;
}
static void
gre_upcall_remove(struct socket *so)
{
so->so_rcv.sb_flags &= ~SB_UPCALL;
so->so_upcallarg = NULL;
so->so_upcall = NULL;
}
/*
 * Create the tunnel socket described by sp in the current LWP's
 * descriptor table: create it, bind it to sp->sp_src and connect it to
 * sp->sp_dst, then try to set IP_TTL (from ip_gre_ttl) and SO_NOHEADER.
 * Failure to set either option is only logged.
 *
 * On success returns 0 and stores the new descriptor in *fdout; the
 * reference taken by fd_getsock() has been dropped again.  On failure
 * returns an errno value; once the socket has been looked up, the
 * descriptor is closed on every error path.
 */
static int
gre_socreate(struct gre_softc *sc, const struct gre_soparm *sp, int *fdout)
{
	const struct protosw *pr;
	int fd, rc;
	struct mbuf *m;
	struct sockaddr *sa;
	struct socket *so;
	sa_family_t af;
	int val;

	GRE_DPRINTF(sc, "enter\n");

	af = sp->sp_src.ss_family;
	rc = fsocreate(af, NULL, sp->sp_type, sp->sp_proto, curlwp, &fd);
	if (rc != 0) {
		GRE_DPRINTF(sc, "fsocreate failed\n");
		return rc;
	}

	if ((rc = fd_getsock(fd, &so)) != 0)
		return rc;

	/* One name mbuf is reused for both the bind and connect address. */
	if ((m = getsombuf(so, MT_SONAME)) == NULL) {
		rc = ENOBUFS;
		goto out;
	}

	sa = mtod(m, struct sockaddr *);
	sockaddr_copy(sa, MIN(MLEN, sizeof(sp->sp_src)), sstocsa(&sp->sp_src));
	m->m_len = sp->sp_src.ss_len;

	if ((rc = sobind(so, m, curlwp)) != 0) {
		GRE_DPRINTF(sc, "sobind failed\n");
		goto out;
	}

	sockaddr_copy(sa, MIN(MLEN, sizeof(sp->sp_dst)), sstocsa(&sp->sp_dst));
	m->m_len = sp->sp_dst.ss_len;

	solock(so);
	if ((rc = soconnect(so, m, curlwp)) != 0) {
		GRE_DPRINTF(sc, "soconnect failed\n");
		sounlock(so);
		goto out;
	}
	sounlock(so);

	/*
	 * The name mbuf still belongs to us after sobind()/soconnect();
	 * keep the pointer so that it is released at "out" below.
	 */

	/* XXX convert to a (new) SOL_SOCKET call */
	pr = so->so_proto;
	KASSERT(pr != NULL);

	rc = so_setsockopt(curlwp, so, IPPROTO_IP, IP_TTL,
	    &ip_gre_ttl, sizeof(ip_gre_ttl));
	if (rc != 0) {
		GRE_DPRINTF(sc, "so_setsockopt ttl failed\n");
		rc = 0;
	}

	val = 1;
	rc = so_setsockopt(curlwp, so, SOL_SOCKET, SO_NOHEADER,
	    &val, sizeof(val));
	if (rc != 0) {
		GRE_DPRINTF(sc, "so_setsockopt SO_NOHEADER failed\n");
		rc = 0;
	}
out:
	m_freem(m);

	if (rc != 0)
		fd_close(fd);
	else {
		fd_putfile(fd);
		*fdout = fd;
	}

	return rc;
}
static int
gre_sosend(struct socket *so, struct mbuf *top)
{
struct mbuf **mp;
struct proc *p;
long space, resid;
int error;
struct lwp * const l = curlwp;
p = l->l_proc;
resid = top->m_pkthdr.len;
if (p)
l->l_ru.ru_msgsnd++;
#define snderr(errno) { error = errno; goto release; }
solock(so);
if ((error = sblock(&so->so_snd, M_NOWAIT)) != 0)
goto out;
if (so->so_state & SS_CANTSENDMORE)
snderr(EPIPE);
if (so->so_error) {
error = so->so_error;
so->so_error = 0;
goto release;
}
if ((so->so_state & SS_ISCONNECTED) == 0) {
if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
if ((so->so_state & SS_ISCONFIRMING) == 0)
snderr(ENOTCONN);
} else
snderr(EDESTADDRREQ);
}
space = sbspace(&so->so_snd);
if (resid > so->so_snd.sb_hiwat)
snderr(EMSGSIZE);
if (space < resid)
snderr(EWOULDBLOCK);
mp = &top;
/*
* Data is prepackaged in "top".
*/
if (so->so_state & SS_CANTSENDMORE)
snderr(EPIPE);
error = (*so->so_proto->pr_usrreq)(so, PRU_SEND, top, NULL, NULL, l);
top = NULL;
mp = &top;
release:
sbunlock(&so->so_snd);
out:
sounlock(so);
if (top != NULL)
m_freem(top);
return error;
}
/* This is a stripped-down version of soreceive() that will never
* block. It will support SOCK_DGRAM sockets. It may also support
* SOCK_SEQPACKET sockets.
*
* so:  socket to read from; its protocol must have PR_ATOMIC set.
* mp0: out parameter; set to NULL on entry and, when data is read, to
*      the head of the mbuf chain removed from the receive buffer.
*
* Returns 0 or an errno value.  EWOULDBLOCK is returned when the receive
* buffer is empty and no other condition applies; the receive buffer is
* locked with M_NOWAIT, so a failure to lock is returned as well.
*/
static int
gre_soreceive(struct socket *so, struct mbuf **mp0)
{
struct mbuf *m, **mp;
int flags, len, error, type;
const struct protosw *pr;
struct mbuf *nextrecord;
KASSERT(mp0 != NULL);
flags = MSG_DONTWAIT;
pr = so->so_proto;
mp = mp0;
type = 0;
*mp = NULL;
KASSERT(pr->pr_flags & PR_ATOMIC);
if (so->so_state & SS_ISCONFIRMING)
(*pr->pr_usrreq)(so, PRU_RCVD, NULL, NULL, NULL, curlwp);
restart:
/* error is 0 from here on unless one of the branches below sets it. */
if ((error = sblock(&so->so_rcv, M_NOWAIT)) != 0) {
return error;
}
m = so->so_rcv.sb_mb;
/*
* If we have less data than requested, do not block awaiting more.
*/
if (m == NULL) {
#ifdef DIAGNOSTIC
if (so->so_rcv.sb_cc)
panic("receive 1");
#endif
/* Empty buffer: report a pending error, EOF (0), ENOTCONN or EWOULDBLOCK. */
if (so->so_error) {
error = so->so_error;
so->so_error = 0;
} else if (so->so_state & SS_CANTRCVMORE)
;
else if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0
&& (so->so_proto->pr_flags & PR_CONNREQUIRED))
error = ENOTCONN;
else
error = EWOULDBLOCK;
goto release;
}
/*
* On entry here, m points to the first record of the socket buffer.
* While we process the initial mbufs containing address and control
* info, we save a copy of m->m_nextpkt into nextrecord.
*/
if (curlwp != NULL)
curlwp->l_ru.ru_msgrcv++;
KASSERT(m == so->so_rcv.sb_mb);
SBLASTRECORDCHK(&so->so_rcv, "soreceive 1");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 1");
nextrecord = m->m_nextpkt;
/* The sender's address mbuf, if the protocol supplies one, is discarded. */
if (pr->pr_flags & PR_ADDR) {
#ifdef DIAGNOSTIC
if (m->m_type != MT_SONAME)
panic("receive 1a");
#endif
sbfree(&so->so_rcv, m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
/* Control mbufs are discarded as well. */
while (m != NULL && m->m_type == MT_CONTROL && error == 0) {
sbfree(&so->so_rcv, m);
/*
* Dispose of any SCM_RIGHTS message that went
* through the read path rather than recv.
*/
if (pr->pr_domain->dom_dispose &&
mtod(m, struct cmsghdr *)->cmsg_type == SCM_RIGHTS)
(*pr->pr_domain->dom_dispose)(m);
MFREE(m, so->so_rcv.sb_mb);
m = so->so_rcv.sb_mb;
}
/*
* If m is non-NULL, we have some data to read. From now on,
* make sure to keep sb_lastrecord consistent when working on
* the last packet on the chain (nextrecord == NULL) and we
* change m->m_nextpkt.
*/
if (m != NULL) {
m->m_nextpkt = nextrecord;
/*
* If nextrecord == NULL (this is a single chain),
* then sb_lastrecord may not be valid here if m
* was changed earlier.
*/
if (nextrecord == NULL) {
KASSERT(so->so_rcv.sb_mb == m);
so->so_rcv.sb_lastrecord = m;
}
type = m->m_type;
if (type == MT_OOBDATA)
flags |= MSG_OOB;
} else {
KASSERT(so->so_rcv.sb_mb == m);
so->so_rcv.sb_mb = nextrecord;
SB_EMPTY_FIXUP(&so->so_rcv);
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 2");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 2");
/* Move the data mbufs of this record, one at a time, onto the *mp0 chain. */
while (m != NULL) {
if (m->m_type == MT_OOBDATA) {
if (type != MT_OOBDATA)
break;
} else if (type == MT_OOBDATA)
break;
#ifdef DIAGNOSTIC
else if (m->m_type != MT_DATA && m->m_type != MT_HEADER)
panic("receive 3");
#endif
so->so_state &= ~SS_RCVATMARK;
if (so->so_oobmark != 0 && so->so_oobmark < m->m_len)
break;
len = m->m_len;
/*
* mp is set, just pass back the mbufs.
* Sockbuf must be consistent here (points to current mbuf,
* it points to next record) when we drop priority;
* we must note any additions to the sockbuf when we
* block interrupts again.
*/
if (m->m_flags & M_EOR)
flags |= MSG_EOR;
nextrecord = m->m_nextpkt;
sbfree(&so->so_rcv, m);
/* Append m to the output chain; mp then points at its (cleared) m_next. */
*mp = m;
mp = &m->m_next;
so->so_rcv.sb_mb = m = m->m_next;
*mp = NULL;
/*
* If m != NULL, we also know that
* so->so_rcv.sb_mb != NULL.
*/
KASSERT(so->so_rcv.sb_mb == m);
if (m) {
m->m_nextpkt = nextrecord;
if (nextrecord == NULL)
so->so_rcv.sb_lastrecord = m;
} else {
so->so_rcv.sb_mb = nextrecord;
SB_EMPTY_FIXUP(&so->so_rcv);
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 3");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 3");
if (so->so_oobmark) {
so->so_oobmark -= len;
if (so->so_oobmark == 0) {
so->so_state |= SS_RCVATMARK;
break;
}
}
if (flags & MSG_EOR)
break;
}
if (m != NULL) {
/*
* The loop stopped before the end of the record: fail with ENOMEM
* and drop the rest of the record.
* NOTE(review): mp points either at *mp0 (still NULL) or at the
* m_next of the last mbuf moved (cleared above), so m_freem(*mp)
* looks like it always frees NULL and any mbufs already moved
* stay in *mp0 -- confirm whether *mp0 was intended here.
*/
m_freem(*mp);
*mp = NULL;
error = ENOMEM;
(void) sbdroprecord(&so->so_rcv);
} else {
/*
* First part is an inline SB_EMPTY_FIXUP(). Second
* part makes sure sb_lastrecord is up-to-date if
* there is still data in the socket buffer.
*/
so->so_rcv.sb_mb = nextrecord;
if (so->so_rcv.sb_mb == NULL) {
so->so_rcv.sb_mbtail = NULL;
so->so_rcv.sb_lastrecord = NULL;
} else if (nextrecord->m_nextpkt == NULL)
so->so_rcv.sb_lastrecord = nextrecord;
}
SBLASTRECORDCHK(&so->so_rcv, "soreceive 4");
SBLASTMBUFCHK(&so->so_rcv, "soreceive 4");
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
(*pr->pr_usrreq)(so, PRU_RCVD, NULL,
(struct mbuf *)(long)flags, NULL, curlwp);
/* Nothing was returned and no end-of-record was seen: try the next record. */
if (*mp0 == NULL && (flags & MSG_EOR) == 0 &&
(so->so_state & SS_CANTRCVMORE) == 0) {
sbunlock(&so->so_rcv);
goto restart;
}
release:
sbunlock(&so->so_rcv);
return error;
}
/*
* Replace the interface's socket configuration.
*
* Any socket currently in use is detached first: its upcall is removed,
* the soft interrupt is disestablished, the helper thread is asked to
* drop its descriptor (GRE_M_DELFP) and the configuration is cleared.
* If newsoparm is non-NULL it is then copied into sc->sc_soparm.  When
* the resulting configuration has a socket, a soft interrupt and the
* receive upcall are installed -- unless the interface is not IFF_UP, in
* which case the code jumps back to "shutdown" and detaches again.
*
* IFF_RUNNING is set when a socket ends up configured; otherwise the
* send queue is purged and IFF_RUNNING is cleared.
*
* Returns the socket now in use, or NULL.
*/
static struct socket *
gre_reconf(struct gre_softc *sc, const struct gre_soparm *newsoparm)
{
struct ifnet *ifp = &sc->sc_if;
GRE_DPRINTF(sc, "enter\n");
shutdown:
/* Detach from the socket currently in use, if any. */
if (sc->sc_soparm.sp_so != NULL) {
GRE_DPRINTF(sc, "\n");
gre_upcall_remove(sc->sc_soparm.sp_so);
softint_disestablish(sc->sc_si);
sc->sc_si = NULL;
gre_fp_send(sc, GRE_M_DELFP, NULL);
gre_clearconf(&sc->sc_soparm, false);
}
/* Install the new parameters once; newsoparm is cleared so that a
* second pass through "shutdown" does not install them again.
*/
if (newsoparm != NULL) {
GRE_DPRINTF(sc, "\n");
sc->sc_soparm = *newsoparm;
newsoparm = NULL;
}
if (sc->sc_soparm.sp_so != NULL) {
GRE_DPRINTF(sc, "\n");
sc->sc_si = softint_establish(SOFTINT_NET, greintr, sc);
gre_upcall_add(sc->sc_soparm.sp_so, sc);
/* An interface that is down must not keep a socket attached. */
if ((ifp->if_flags & IFF_UP) == 0) {
GRE_DPRINTF(sc, "down\n");
goto shutdown;
}
}
GRE_DPRINTF(sc, "\n");
if (sc->sc_soparm.sp_so != NULL)
sc->sc_if.if_flags |= IFF_RUNNING;
else {
gre_bufq_purge(&sc->sc_snd);
sc->sc_if.if_flags &= ~IFF_RUNNING;
}
return sc->sc_soparm.sp_so;
}
static int
gre_input(struct gre_softc *sc, struct mbuf *m, int hlen,
const struct gre_h *gh)
{
uint16_t flags;
uint32_t af; /* af passed to BPF tap */
int isr, s;
struct ifqueue *ifq;
sc->sc_if.if_ipackets++;
sc->sc_if.if_ibytes += m->m_pkthdr.len;
hlen += sizeof(struct gre_h);
/* process GRE flags as packet can be of variable len */
flags = ntohs(gh->flags);
/* Checksum & Offset are present */
if ((flags & GRE_CP) | (flags & GRE_RP))
hlen += 4;
/* We don't support routing fields (variable length) */
if (flags & GRE_RP) {
sc->sc_if.if_ierrors++;
return 0;
}
if (flags & GRE_KP)
hlen += 4;
if (flags & GRE_SP)
hlen += 4;
switch (ntohs(gh->ptype)) { /* ethertypes */
#ifdef INET
case ETHERTYPE_IP:
ifq = &ipintrq;
isr = NETISR_IP;
af = AF_INET;
break;
#endif
#ifdef NETATALK
case ETHERTYPE_ATALK:
ifq = &atintrq1;
isr = NETISR_ATALK;
af = AF_APPLETALK;
break;
#endif
#ifdef INET6
case ETHERTYPE_IPV6:
ifq = &ip6intrq;
isr = NETISR_IPV6;
af = AF_INET6;
break;
#endif
#ifdef MPLS
case ETHERTYPE_MPLS:
ifq = &mplsintrq;
isr = NETISR_MPLS;
af = AF_MPLS;
break;
#endif
default: /* others not yet supported */
GRE_DPRINTF(sc, "unhandled ethertype 0x%04x\n",
ntohs(gh->ptype));
sc->sc_if.if_noproto++;
return 0;
}
if (hlen > m->m_pkthdr.len) {
m_freem(m);
sc->sc_if.if_ierrors++;
return EINVAL;
}
m_adj(m, hlen);
bpf_mtap_af(&sc->sc_if, af, m);
m->m_pkthdr.rcvif = &sc->sc_if;
s = splnet();
if (IF_QFULL(ifq)) {
IF_DROP(ifq);
m_freem(m);
} else {
IF_ENQUEUE(ifq, m);
}
/* we need schednetisr since the address family may change */
schednetisr(isr);
splx(s);
return 1; /* packet is done, no further processing needed */
}
/*
* The output routine. Takes a packet and encapsulates it in the protocol
* given by sc->sc_soparm.sp_proto. See also RFC 1701 and RFC 2004
*/
static int
gre_output(struct ifnet *ifp, struct mbuf *m, const struct sockaddr *dst,
struct rtentry *rt)
{
int error = 0;
struct gre_softc *sc = ifp->if_softc;
struct gre_h *gh;
uint16_t etype = 0;
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) != (IFF_UP|IFF_RUNNING)) {
m_freem(m);
error = ENETDOWN;
goto end;
}
bpf_mtap_af(ifp, dst->sa_family, m);
m->m_flags &= ~(M_BCAST|M_MCAST);
GRE_DPRINTF(sc, "dst->sa_family=%d\n", dst->sa_family);
switch (dst->sa_family) {
#ifdef INET
case AF_INET:
/* TBD Extract the IP ToS field and set the
* encapsulating protocol's ToS to suit.
*/
etype = htons(ETHERTYPE_IP);
break;
#endif
#ifdef NETATALK
case AF_APPLETALK:
etype = htons(ETHERTYPE_ATALK);
break;
#endif
#ifdef INET6
case AF_INET6:
etype = htons(ETHERTYPE_IPV6);
break;
#endif
default:
IF_DROP(&ifp->if_snd);
m_freem(m);
error = EAFNOSUPPORT;
goto end;
}
#ifdef MPLS
if (rt != NULL && rt_gettag(rt) != NULL) {
union mpls_shim msh;
msh.s_addr = MPLS_GETSADDR(rt);
if (msh.shim.label != MPLS_LABEL_IMPLNULL)
etype = htons(ETHERTYPE_MPLS);
}
#endif
M_PREPEND(m, sizeof(*gh), M_DONTWAIT);
if (m == NULL) {
IF_DROP(&ifp->if_snd);
error = ENOBUFS;
goto end;
}
gh = mtod(m, struct gre_h *);
gh->flags = 0;
gh->ptype = etype;
/* XXX Need to handle IP ToS. Look at how I handle IP TTL. */
ifp->if_opackets++;
ifp->if_obytes += m->m_pkthdr.len;
/* send it off */
if ((error = gre_bufq_enqueue(&sc->sc_snd, m)) != 0) {
sc->sc_oflow_ev.ev_count++;
m_freem(m);
} else
softint_schedule(sc->sc_si);
end:
if (error)
ifp->if_oerrors++;
return error;
}
static int
gre_getname(struct socket *so, int req, struct mbuf *nam, struct lwp *l)
{
return (*so->so_proto->pr_usrreq)(so, req, NULL, nam, NULL, l);
}
/* Fetch the local address of so into nam (PRU_SOCKADDR). */
static int
gre_getsockname(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	const int rc = gre_getname(so, PRU_SOCKADDR, nam, l);

	return rc;
}
/* Fetch the peer address of so into nam (PRU_PEERADDR). */
static int
gre_getpeername(struct socket *so, struct mbuf *nam, struct lwp *l)
{
	const int rc = gre_getname(so, PRU_PEERADDR, nam, l);

	return rc;
}
/*
 * Copy the local and peer addresses of so into *src and *dst, with the
 * socket locked.  Returns 0, ENOBUFS if no name mbuf could be obtained,
 * or the error from the first query that failed (*dst, and possibly
 * *src, are then left untouched).
 */
static int
gre_getnames(struct socket *so, struct lwp *l, struct sockaddr_storage *src,
    struct sockaddr_storage *dst)
{
	struct mbuf *nam;
	struct sockaddr_storage *ss;
	int rc;

	nam = getsombuf(so, MT_SONAME);
	if (nam == NULL)
		return ENOBUFS;

	ss = mtod(nam, struct sockaddr_storage *);

	solock(so);
	rc = gre_getsockname(so, nam, l);
	if (rc == 0) {
		*src = *ss;
		rc = gre_getpeername(so, nam, l);
		if (rc == 0)
			*dst = *ss;
	}
	sounlock(so);

	m_freem(nam);
	return rc;
}
/*
 * Body of the per-interface helper thread: with sc_mtx held, keep
 * processing messages through gre_fp_recv() until it returns false,
 * then exit the thread.
 */
static void
gre_fp_recvloop(void *arg)
{
	struct gre_softc *sc = arg;

	mutex_enter(&sc->sc_mtx);
	for (;;) {
		if (!gre_fp_recv(sc))
			break;
	}
	mutex_exit(&sc->sc_mtx);
	kthread_exit(0);
}
/*
* Handle one message posted in sc->sc_msg on behalf of the helper thread.
* Called with sc_mtx held (see gre_fp_recvloop()); the mutex is dropped
* around the descriptor operations and re-taken afterwards.
*
* GRE_M_STOP:  signal the condvar and return false to end the loop.
* GRE_M_SETFP: duplicate sc->sc_fp into a new descriptor; on failure
*              reply GRE_M_ERR, otherwise continue as for GRE_M_DELFP.
* GRE_M_DELFP: close the previously recorded descriptor (if any), record
*              the new one (-1 for a pure delete) and reply GRE_M_OK.
* otherwise:   nothing to do; sleep until signalled.
*
* Returns true if the loop should keep running.
*/
static bool
gre_fp_recv(struct gre_softc *sc)
{
int fd, ofd, rc;
file_t *fp;
fp = sc->sc_fp;
ofd = sc->sc_fd;
fd = -1;
switch (sc->sc_msg) {
case GRE_M_STOP:
cv_signal(&sc->sc_fp_condvar);
return false;
case GRE_M_SETFP:
mutex_exit(&sc->sc_mtx);
rc = fd_dup(fp, 0, &fd, 0);
mutex_enter(&sc->sc_mtx);
if (rc != 0) {
sc->sc_msg = GRE_M_ERR;
break;
}
/*FALLTHROUGH*/
case GRE_M_DELFP:
mutex_exit(&sc->sc_mtx);
/* fd_getfile() takes the reference that fd_close() consumes. */
if (ofd != -1 && fd_getfile(ofd) != NULL)
fd_close(ofd);
mutex_enter(&sc->sc_mtx);
sc->sc_fd = fd;
sc->sc_msg = GRE_M_OK;
break;
default:
gre_fp_wait(sc);
return true;
}
/* Wake the sender waiting for the reply in gre_fp_send(). */
cv_signal(&sc->sc_fp_condvar);
return true;
}
/*
 * Post message msg (with file fp) to the helper thread and wait for the
 * answer.  Waits until the mailbox is free (GRE_M_NONE), posts the
 * request, then sleeps until sc_msg becomes GRE_M_STOP, GRE_M_OK or
 * GRE_M_ERR.  The mailbox is reset to GRE_M_NONE before returning.
 *
 * Returns false only if the reply was GRE_M_ERR.
 */
static bool
gre_fp_send(struct gre_softc *sc, enum gre_msg msg, file_t *fp)
{
	bool ok;

	mutex_enter(&sc->sc_mtx);

	/* Wait for our turn. */
	while (sc->sc_msg != GRE_M_NONE)
		gre_fp_wait(sc);

	sc->sc_fp = fp;
	sc->sc_msg = msg;
	cv_signal(&sc->sc_fp_condvar);

	/* Wait for a reply. */
	for (;;) {
		if (sc->sc_msg == GRE_M_STOP || sc->sc_msg == GRE_M_OK ||
		    sc->sc_msg == GRE_M_ERR)
			break;
		gre_fp_wait(sc);
	}
	ok = (sc->sc_msg != GRE_M_ERR);

	sc->sc_msg = GRE_M_NONE;
	cv_signal(&sc->sc_fp_condvar);
	mutex_exit(&sc->sc_mtx);

	return ok;
}
static int
gre_ssock(struct ifnet *ifp, struct gre_soparm *sp, int fd)
{
int error = 0;
const struct protosw *pr;
file_t *fp;
struct gre_softc *sc = ifp->if_softc;
struct socket *so;
struct sockaddr_storage dst, src;
if ((fp = fd_getfile(fd)) == NULL)
return EBADF;
if (fp->f_type != DTYPE_SOCKET) {
fd_putfile(fd);
return ENOTSOCK;
}
GRE_DPRINTF(sc, "\n");
so = (struct socket *)fp->f_data;
pr = so->so_proto;
GRE_DPRINTF(sc, "type %d, proto %d\n", pr->pr_type, pr->pr_protocol);
if ((pr->pr_flags & PR_ATOMIC) == 0 ||
(sp->sp_type != 0 && pr->pr_type != sp->sp_type) ||
(sp->sp_proto != 0 && pr->pr_protocol != 0 &&
pr->pr_protocol != sp->sp_proto)) {
error = EINVAL;
goto err;
}
GRE_DPRINTF(sc, "\n");
/* check address */
if ((error = gre_getnames(so, curlwp, &src, &dst)) != 0)
goto err;
GRE_DPRINTF(sc, "\n");
if (!gre_fp_send(sc, GRE_M_SETFP, fp)) {
error = EBUSY;
goto err;
}
GRE_DPRINTF(sc, "\n");
sp->sp_src = src;
sp->sp_dst = dst;
sp->sp_so = so;
err:
fd_putfile(fd);
return error;
}
/*
 * True if the address part of sa equals the wildcard ("any") address of
 * its family.  False if either address cannot be obtained or if sa's
 * address is longer than the wildcard address.
 */
static bool
sockaddr_is_anyaddr(const struct sockaddr *sa)
{
	const void *wildcard, *mine;
	socklen_t wildcardlen, minelen;

	wildcard = sockaddr_anyaddr(sa, &wildcardlen);
	if (wildcard == NULL)
		return false;

	mine = sockaddr_const_addr(sa, &minelen);
	if (mine == NULL)
		return false;

	if (minelen > wildcardlen)
		return false;

	return memcmp(wildcard, mine, MIN(wildcardlen, minelen)) == 0;
}
static bool
gre_is_nullconf(const struct gre_soparm *sp)
{
return sockaddr_is_anyaddr(sstocsa(&sp->sp_src)) ||
sockaddr_is_anyaddr(sstocsa(&sp->sp_dst));
}
/*
 * Forget the socket in sp.  If the configuration came from a
 * user-supplied socket (sp_bysock) or force is set, the source and
 * destination are also reset to the wildcard address of their family
 * and sp_bysock is cleared.
 */
static void
gre_clearconf(struct gre_soparm *sp, bool force)
{
	if (force || sp->sp_bysock) {
		struct sockaddr *src = sstosa(&sp->sp_src);
		struct sockaddr *dst = sstosa(&sp->sp_dst);

		sockaddr_copy(src, sizeof(sp->sp_src), sockaddr_any(src));
		sockaddr_copy(dst, sizeof(sp->sp_dst), sockaddr_any(dst));
		sp->sp_bysock = false;
	}
	sp->sp_so = NULL; /* XXX */
}
/*
 * Serialise ioctls: wait while another ioctl holds the softc
 * (GRE_S_IOCTL), then claim it by moving GRE_S_IDLE -> GRE_S_IOCTL.
 * If the softc is in any other state, pass the wakeup on and fail with
 * ENXIO.  Returns 0 when the lock was taken.
 */
static int
gre_ioctl_lock(struct gre_softc *sc)
{
	mutex_enter(&sc->sc_mtx);

	while (sc->sc_state == GRE_S_IOCTL)
		gre_wait(sc);

	if (sc->sc_state == GRE_S_IDLE) {
		sc->sc_state = GRE_S_IOCTL;
		mutex_exit(&sc->sc_mtx);
		return 0;
	}

	cv_signal(&sc->sc_condvar);
	mutex_exit(&sc->sc_mtx);
	GRE_DPRINTF(sc, "\n");
	return ENXIO;
}
static void
gre_ioctl_unlock(struct gre_softc *sc)
{
mutex_enter(&sc->sc_mtx);
KASSERT(sc->sc_state == GRE_S_IOCTL);
sc->sc_state = GRE_S_IDLE;
cv_signal(&sc->sc_condvar);
mutex_exit(&sc->sc_mtx);
}
/*
* Interface ioctl handler.
*
* Requests that change the interface (the first switch below) require
* KAUTH_REQ_NETWORK_INTERFACE_SETPRIV and fail with EPERM otherwise.
* The handler then takes the per-softc ioctl lock and works on sp0, a
* private copy of the current socket parameters with sp_so cleared;
* the copy only takes effect when it is passed to gre_reconf() at the
* "sendconf" label.
*
* Several cases share the tail of the GRESADDRS/GRESADDRD case through
* the labels checkaddr (validate both address families), mksocket
* (create a socket for the new parameters), setsock (adopt descriptor
* fd) and sendconf (apply the configuration).
*
* Returns 0 or an errno value.
*/
static int
gre_ioctl(struct ifnet *ifp, const u_long cmd, void *data)
{
struct ifreq *ifr;
struct if_laddrreq *lifr = (struct if_laddrreq *)data;
struct gre_softc *sc = ifp->if_softc;
struct gre_soparm *sp;
int fd, error = 0, oproto, otype, s;
struct gre_soparm sp0;
ifr = data;
GRE_DPRINTF(sc, "cmd %lu\n", cmd);
/* Privilege check for the requests that modify state. */
switch (cmd) {
case SIOCSIFFLAGS:
case SIOCSIFMTU:
case GRESPROTO:
case GRESADDRD:
case GRESADDRS:
case GRESSOCK:
case GREDSOCK:
case SIOCSLIFPHYADDR:
case SIOCDIFPHYADDR:
if (kauth_authorize_network(curlwp->l_cred,
KAUTH_NETWORK_INTERFACE,
KAUTH_REQ_NETWORK_INTERFACE_SETPRIV, ifp, (void *)cmd,
NULL) != 0)
return EPERM;
break;
default:
break;
}
if ((error = gre_ioctl_lock(sc)) != 0) {
GRE_DPRINTF(sc, "\n");
return error;
}
s = splnet();
/* Work on a scratch copy of the socket parameters. */
sp0 = sc->sc_soparm;
sp0.sp_so = NULL;
sp = &sp0;
GRE_DPRINTF(sc, "\n");
switch (cmd) {
case SIOCINITIFADDR:
GRE_DPRINTF(sc, "\n");
/* Nothing to do if the interface is already up. */
if ((ifp->if_flags & IFF_UP) != 0)
break;
gre_clearconf(sp, false);
ifp->if_flags |= IFF_UP;
goto mksocket;
case SIOCSIFDSTADDR:
break;
case SIOCSIFFLAGS:
if ((error = ifioctl_common(ifp, cmd, data)) != 0)
break;
oproto = sp->sp_proto;
otype = sp->sp_type;
/* IFF_LINK0/IFF_LINK2 select the encapsulation. */
switch (ifr->ifr_flags & (IFF_LINK0|IFF_LINK2)) {
case IFF_LINK0|IFF_LINK2:
sp->sp_proto = IPPROTO_UDP;
sp->sp_type = SOCK_DGRAM;
break;
case IFF_LINK2:
sp->sp_proto = 0;
sp->sp_type = 0;
break;
case IFF_LINK0:
sp->sp_proto = IPPROTO_GRE;
sp->sp_type = SOCK_RAW;
break;
default:
GRE_DPRINTF(sc, "\n");
error = EINVAL;
goto out;
}
GRE_DPRINTF(sc, "\n");
gre_clearconf(sp, false);
/* Up, running and same encapsulation: keep the current socket. */
if ((ifp->if_flags & (IFF_UP|IFF_RUNNING)) ==
(IFF_UP|IFF_RUNNING) &&
(oproto == sp->sp_proto || sp->sp_proto == 0) &&
(otype == sp->sp_type || sp->sp_type == 0))
break;
switch (sp->sp_proto) {
case IPPROTO_UDP:
case IPPROTO_GRE:
goto mksocket;
default:
break;
}
break;
case SIOCSIFMTU:
/* XXX determine MTU automatically by probing w/
* XXX do-not-fragment packets?
*/
if (ifr->ifr_mtu < 576) {
error = EINVAL;
break;
}
/*FALLTHROUGH*/
case SIOCGIFMTU:
if ((error = ifioctl_common(ifp, cmd, data)) == ENETRESET)
error = 0;
break;
case SIOCADDMULTI:
case SIOCDELMULTI:
if (ifr == NULL) {
error = EAFNOSUPPORT;
break;
}
/* Only the address family is checked here. */
switch (ifreq_getaddr(cmd, ifr)->sa_family) {
#ifdef INET
case AF_INET:
break;
#endif
#ifdef INET6
case AF_INET6:
break;
#endif
default:
error = EAFNOSUPPORT;
break;
}
break;
case GRESPROTO:
gre_clearconf(sp, false);
oproto = sp->sp_proto;
otype = sp->sp_type;
sp->sp_proto = ifr->ifr_flags;
/* Keep the link flags in step with the chosen protocol. */
switch (sp->sp_proto) {
case IPPROTO_UDP:
ifp->if_flags |= IFF_LINK0|IFF_LINK2;
sp->sp_type = SOCK_DGRAM;
break;
case IPPROTO_GRE:
ifp->if_flags |= IFF_LINK0;
ifp->if_flags &= ~IFF_LINK2;
sp->sp_type = SOCK_RAW;
break;
case 0:
ifp->if_flags &= ~IFF_LINK0;
ifp->if_flags |= IFF_LINK2;
sp->sp_type = 0;
break;
default:
error = EPROTONOSUPPORT;
break;
}
if ((oproto == sp->sp_proto || sp->sp_proto == 0) &&
(otype == sp->sp_type || sp->sp_type == 0))
break;
switch (sp->sp_proto) {
case IPPROTO_UDP:
case IPPROTO_GRE:
goto mksocket;
default:
break;
}
break;
case GREGPROTO:
ifr->ifr_flags = sp->sp_proto;
break;
case GRESADDRS:
case GRESADDRD:
gre_clearconf(sp, false);
/* set tunnel endpoints and mark interface as up */
switch (cmd) {
case GRESADDRS:
sockaddr_copy(sstosa(&sp->sp_src),
sizeof(sp->sp_src), ifreq_getaddr(cmd, ifr));
break;
case GRESADDRD:
sockaddr_copy(sstosa(&sp->sp_dst),
sizeof(sp->sp_dst), ifreq_getaddr(cmd, ifr));
break;
}
checkaddr:
/* Both endpoints must belong to a family that has a wildcard address. */
if (sockaddr_any(sstosa(&sp->sp_src)) == NULL ||
sockaddr_any(sstosa(&sp->sp_dst)) == NULL) {
error = EINVAL;
break;
}
/* let gre_socreate() check the rest */
mksocket:
GRE_DPRINTF(sc, "\n");
/* If we're administratively down, or the configuration
* is empty, there's no use creating a socket.
*/
if ((ifp->if_flags & IFF_UP) == 0 || gre_is_nullconf(sp))
goto sendconf;
GRE_DPRINTF(sc, "\n");
fd = 0;
error = gre_socreate(sc, sp, &fd);
if (error != 0)
break;
setsock:
GRE_DPRINTF(sc, "\n");
error = gre_ssock(ifp, sp, fd);
/* A descriptor created above by gre_socreate() is closed again;
* one supplied by the caller through GRESSOCK is left alone.
*/
if (cmd != GRESSOCK) {
GRE_DPRINTF(sc, "\n");
/* XXX v. dodgy */
if (fd_getfile(fd) != NULL)
fd_close(fd);
}
if (error == 0) {
sendconf:
GRE_DPRINTF(sc, "\n");
ifp->if_flags &= ~IFF_RUNNING;
gre_reconf(sc, sp);
}
break;
case GREGADDRS:
ifreq_setaddr(cmd, ifr, sstosa(&sp->sp_src));
break;
case GREGADDRD:
ifreq_setaddr(cmd, ifr, sstosa(&sp->sp_dst));
break;
case GREDSOCK:
GRE_DPRINTF(sc, "\n");
if (sp->sp_bysock)
ifp->if_flags &= ~IFF_UP;
gre_clearconf(sp, false);
goto mksocket;
case GRESSOCK:
GRE_DPRINTF(sc, "\n");
gre_clearconf(sp, true);
fd = (int)ifr->ifr_value;
sp->sp_bysock = true;
ifp->if_flags |= IFF_UP;
goto setsock;
case SIOCSLIFPHYADDR:
GRE_DPRINTF(sc, "\n");
if (lifr->addr.ss_family != lifr->dstaddr.ss_family) {
error = EAFNOSUPPORT;
break;
}
sockaddr_copy(sstosa(&sp->sp_src), sizeof(sp->sp_src),
sstosa(&lifr->addr));
sockaddr_copy(sstosa(&sp->sp_dst), sizeof(sp->sp_dst),
sstosa(&lifr->dstaddr));
GRE_DPRINTF(sc, "\n");
goto checkaddr;
case SIOCDIFPHYADDR:
GRE_DPRINTF(sc, "\n");
gre_clearconf(sp, true);
ifp->if_flags &= ~IFF_UP;
goto mksocket;
case SIOCGLIFPHYADDR:
GRE_DPRINTF(sc, "\n");
if (gre_is_nullconf(sp)) {
error = EADDRNOTAVAIL;
break;
}
sockaddr_copy(sstosa(&lifr->addr), sizeof(lifr->addr),
sstosa(&sp->sp_src));
sockaddr_copy(sstosa(&lifr->dstaddr), sizeof(lifr->dstaddr),
sstosa(&sp->sp_dst));
GRE_DPRINTF(sc, "\n");
break;
default:
error = ifioctl_common(ifp, cmd, data);
break;
}
out:
GRE_DPRINTF(sc, "\n");
splx(s);
gre_ioctl_unlock(sc);
return error;
}
void greattach(int);
/* ARGSUSED */
void
greattach(int count)
{
if_clone_attach(&gre_cloner);
}