NetBSD/sys/netinet6/raw_ip6.c

889 lines
23 KiB
C
Raw Normal View History

2013-11-23 18:20:21 +04:00
/* $NetBSD: raw_ip6.c,v 1.112 2013/11/23 14:20:22 christos Exp $ */
/* $KAME: raw_ip6.c,v 1.82 2001/07/23 18:57:56 jinmei Exp $ */
1999-07-04 01:24:45 +04:00
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1982, 1986, 1988, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)raw_ip.c 8.2 (Berkeley) 1/4/94
*/
2001-11-13 03:56:55 +03:00
#include <sys/cdefs.h>
2013-11-23 18:20:21 +04:00
__KERNEL_RCSID(0, "$NetBSD: raw_ip6.c,v 1.112 2013/11/23 14:20:22 christos Exp $");
2001-11-13 03:56:55 +03:00
#include "opt_ipsec.h"
#include <sys/param.h>
#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/socket.h>
#include <sys/protosw.h>
#include <sys/socketvar.h>
#include <sys/errno.h>
#include <sys/systm.h>
#include <sys/proc.h>
2006-05-15 01:19:33 +04:00
#include <sys/kauth.h>
#include <net/if.h>
#include <net/route.h>
#include <net/if_types.h>
#include <net/net_stats.h>
#include <netinet/in.h>
#include <netinet/in_var.h>
#include <netinet/ip6.h>
#include <netinet6/ip6_var.h>
2008-04-15 07:57:04 +04:00
#include <netinet6/ip6_private.h>
#include <netinet6/ip6_mroute.h>
#include <netinet/icmp6.h>
2008-04-15 07:57:04 +04:00
#include <netinet6/icmp6_private.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/nd6.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/scope6_var.h>
#include <netinet6/raw_ip6.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
#include <netipsec/ipsec_var.h>
#include <netipsec/ipsec_private.h>
#include <netipsec/ipsec6.h>
#endif
#include "faith.h"
#if defined(NFAITH) && 0 < NFAITH
#include <net/if_faith.h>
#endif
/*
 * NOTE(review): rawcbtable is only declared here and is not referenced in
 * the visible part of this file; presumably the raw IPv4 PCB table -- confirm.
 */
extern struct inpcbtable rawcbtable;
/* PCB table holding the raw IPv6 sockets; set up by rip6_init(). */
struct inpcbtable raw6cbtable;
/* Reinterpret a generic interface-address pointer as a struct in6_ifaddr *. */
#define ifatoia6(ifa) ((struct in6_ifaddr *)(ifa))
/*
* Raw interface to IP6 protocol.
*/
2008-04-15 09:13:37 +04:00
/* Per-CPU storage for the raw IPv6 counters; allocated in rip6_init(). */
static percpu_t *rip6stat_percpu;
/* Increment raw IPv6 counter x (a RIP6_STAT_* index) via _NET_STATINC. */
#define RIP6_STATINC(x) _NET_STATINC(rip6stat_percpu, x)
/* Sysctl setup routine for raw IPv6; called from rip6_init(), defined later. */
static void sysctl_net_inet6_raw6_setup(struct sysctllog **);
/*
* Initialize raw connection block queue.
*/
void
2009-03-16 00:23:31 +03:00
rip6_init(void)
{
2002-06-08 02:08:41 +04:00
sysctl_net_inet6_raw6_setup(NULL);
in6_pcbinit(&raw6cbtable, 1, 1);
2008-04-15 09:13:37 +04:00
rip6stat_percpu = percpu_alloc(sizeof(uint64_t) * RIP6_NSTATS);
}
/*
* Setup generic address and protocol structures
* for raw_input routine, then pass them along with
* mbuf chain.
*/
int
rip6_input(struct mbuf **mp, int *offp, int proto)
{
struct mbuf *m = *mp;
struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *);
struct inpcb_hdr *inph;
struct in6pcb *in6p;
struct in6pcb *last = NULL;
struct sockaddr_in6 rip6src;
struct mbuf *opts = NULL;
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_IPACKETS);
#if defined(NFAITH) && 0 < NFAITH
if (faithprefix(&ip6->ip6_dst)) {
/* send icmp6 host unreach? */
m_freem(m);
return IPPROTO_DONE;
}
#endif
/* Be proactive about malicious use of IPv4 mapped address */
if (IN6_IS_ADDR_V4MAPPED(&ip6->ip6_src) ||
IN6_IS_ADDR_V4MAPPED(&ip6->ip6_dst)) {
/* XXX stat */
m_freem(m);
return IPPROTO_DONE;
}
2007-11-07 02:40:38 +03:00
sockaddr_in6_init(&rip6src, &ip6->ip6_src, 0, 0, 0);
if (sa6_recoverscope(&rip6src) != 0) {
/* XXX: should be impossible. */
m_freem(m);
return IPPROTO_DONE;
}
2013-11-23 18:20:21 +04:00
TAILQ_FOREACH(inph, &raw6cbtable.inpt_queue, inph_queue) {
in6p = (struct in6pcb *)inph;
if (in6p->in6p_af != AF_INET6)
continue;
if (in6p->in6p_ip6.ip6_nxt &&
in6p->in6p_ip6.ip6_nxt != proto)
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_laddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_laddr, &ip6->ip6_dst))
continue;
if (!IN6_IS_ADDR_UNSPECIFIED(&in6p->in6p_faddr) &&
!IN6_ARE_ADDR_EQUAL(&in6p->in6p_faddr, &ip6->ip6_src))
continue;
if (in6p->in6p_cksum != -1) {
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_ISUM);
if (in6_cksum(m, proto, *offp,
m->m_pkthdr.len - *offp)) {
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_BADSUM);
continue;
}
}
if (last) {
struct mbuf *n;
#ifdef IPSEC
/*
* Check AH/ESP integrity
*/
if (!ipsec6_in_reject(m,last))
#endif /* IPSEC */
if ((n = m_copy(m, 0, (int)M_COPYALL)) != NULL) {
if (last->in6p_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(last, &opts, ip6, n);
/* strip intermediate headers */
m_adj(n, *offp);
if (sbappendaddr(&last->in6p_socket->so_rcv,
2002-06-08 02:03:02 +04:00
(struct sockaddr *)&rip6src, n, opts) == 0) {
/* should notify about lost packet */
m_freem(n);
if (opts)
m_freem(opts);
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_FULLSOCK);
} else
sorwakeup(last->in6p_socket);
opts = NULL;
}
}
last = in6p;
}
#ifdef IPSEC
if (last && ipsec6_in_reject(m, last)) {
m_freem(m);
/*
* XXX ipsec6_in_reject update stat if there is an error
* so we just need to update stats by hand in the case of last is
* NULL
*/
if (!last)
IPSEC6_STATINC(IPSEC_STAT_IN_POLVIO);
2008-04-15 07:57:04 +04:00
IP6_STATDEC(IP6_STAT_DELIVERED);
/* do not inject data into pcb */
} else
#endif /* IPSEC */
if (last) {
if (last->in6p_flags & IN6P_CONTROLOPTS)
ip6_savecontrol(last, &opts, ip6, m);
/* strip intermediate headers */
m_adj(m, *offp);
if (sbappendaddr(&last->in6p_socket->so_rcv,
2002-06-08 02:03:02 +04:00
(struct sockaddr *)&rip6src, m, opts) == 0) {
m_freem(m);
if (opts)
m_freem(opts);
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_FULLSOCK);
} else
sorwakeup(last->in6p_socket);
} else {
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_NOSOCK);
if (m->m_flags & M_MCAST)
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_NOSOCKMCAST);
if (proto == IPPROTO_NONE)
m_freem(m);
else {
u_int8_t *prvnxtp = ip6_get_prevhdr(m, *offp); /* XXX */
in6_ifstat_inc(m->m_pkthdr.rcvif, ifs6_in_protounknown);
icmp6_error(m, ICMP6_PARAM_PROB,
2002-06-08 02:03:02 +04:00
ICMP6_PARAMPROB_NEXTHEADER,
prvnxtp - mtod(m, u_int8_t *));
}
2008-04-15 07:57:04 +04:00
IP6_STATDEC(IP6_STAT_DELIVERED);
}
return IPPROTO_DONE;
}
void *
rip6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct ip6_hdr *ip6;
struct ip6ctlparam *ip6cp = NULL;
const struct sockaddr_in6 *sa6_src = NULL;
void *cmdarg;
2007-11-01 23:33:56 +03:00
void (*notify)(struct in6pcb *, int) = in6_rtchange;
int nxt;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
2009-01-03 06:43:21 +03:00
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
2009-01-03 06:43:21 +03:00
return NULL;
if (PRC_IS_REDIRECT(cmd))
notify = in6_rtchange, d = NULL;
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (inet6ctlerrmap[cmd] == 0)
2009-01-03 06:43:21 +03:00
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
ip6cp = (struct ip6ctlparam *)d;
ip6 = ip6cp->ip6c_ip6;
cmdarg = ip6cp->ip6c_cmdarg;
sa6_src = ip6cp->ip6c_src;
nxt = ip6cp->ip6c_nxt;
} else {
ip6 = NULL;
cmdarg = NULL;
sa6_src = &sa6_any;
nxt = -1;
}
if (ip6 && cmd == PRC_MSGSIZE) {
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
int valid = 0;
struct in6pcb *in6p;
/*
* Check to see if we have a valid raw IPv6 socket
* corresponding to the address in the ICMPv6 message
* payload, and the protocol (ip6_nxt) meets the socket.
* XXX chase extension headers, or pass final nxt value
* from icmp6_notify_error()
*/
in6p = NULL;
in6p = in6_pcblookup_connect(&raw6cbtable, &sa6->sin6_addr, 0,
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
(const struct in6_addr *)&sa6_src->sin6_addr, 0, 0, 0);
#if 0
if (!in6p) {
/*
* As the use of sendto(2) is fairly popular,
* we may want to allow non-connected pcb too.
* But it could be too weak against attacks...
* We should at least check if the local
* address (= s) is really ours.
*/
in6p = in6_pcblookup_bind(&raw6cbtable,
&sa6->sin6_addr, 0, 0);
}
#endif
if (in6p && in6p->in6p_ip6.ip6_nxt &&
in6p->in6p_ip6.ip6_nxt == nxt)
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
2002-06-08 02:05:37 +04:00
* - recalculate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
* regardless of if we called icmp6_mtudisc_update(),
* we need to call in6_pcbnotify(), to notify path MTU
* change to the userland (RFC3542), because some
* unconnected sockets may share the same destination
* and want to know the path MTU.
*/
}
(void) in6_pcbnotify(&raw6cbtable, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, cmdarg, notify);
return NULL;
}
/*
* Generate IPv6 header and pass packet to ip6_output.
* Tack on options user may have setup with control call.
*/
int
rip6_output(struct mbuf *m, struct socket * const so,
struct sockaddr_in6 * const dstsock, struct mbuf * const control)
{
struct in6_addr *dst;
struct ip6_hdr *ip6;
struct in6pcb *in6p;
u_int plen = m->m_pkthdr.len;
int error = 0;
struct ip6_pktopts opt, *optp = NULL;
struct ifnet *oifp = NULL;
int type, code; /* for ICMPv6 output statistics only */
int scope_ambiguous = 0;
struct in6_addr *in6a;
in6p = sotoin6pcb(so);
dst = &dstsock->sin6_addr;
if (control) {
if ((error = ip6_setpktopts(control, &opt,
in6p->in6p_outputopts,
kauth_cred_get(), so->so_proto->pr_protocol)) != 0) {
goto bad;
}
optp = &opt;
} else
optp = in6p->in6p_outputopts;
/*
* Check and convert scope zone ID into internal form.
* XXX: we may still need to determine the zone later.
*/
if (!(so->so_state & SS_ISCONNECTED)) {
if (dstsock->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(dstsock, ip6_use_defzone)) != 0)
goto bad;
}
/*
* For an ICMPv6 packet, we should know its type and code
* to update statistics.
*/
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
struct icmp6_hdr *icmp6;
if (m->m_len < sizeof(struct icmp6_hdr) &&
(m = m_pullup(m, sizeof(struct icmp6_hdr))) == NULL) {
error = ENOBUFS;
goto bad;
}
icmp6 = mtod(m, struct icmp6_hdr *);
type = icmp6->icmp6_type;
code = icmp6->icmp6_code;
2003-10-25 12:26:14 +04:00
} else {
type = 0;
code = 0;
}
2003-05-28 02:36:38 +04:00
M_PREPEND(m, sizeof(*ip6), M_DONTWAIT);
if (!m) {
error = ENOBUFS;
goto bad;
}
ip6 = mtod(m, struct ip6_hdr *);
/*
* Next header might not be ICMP6 but use its pseudo header anyway.
*/
ip6->ip6_dst = *dst;
/*
* Source address selection.
*/
if ((in6a = in6_selectsrc(dstsock, optp, in6p->in6p_moptions,
&in6p->in6p_route, &in6p->in6p_laddr, &oifp,
&error)) == 0) {
if (error == 0)
error = EADDRNOTAVAIL;
goto bad;
}
ip6->ip6_src = *in6a;
if (oifp && scope_ambiguous) {
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as it should, so we need a
* workaround. Even if an appropriate ID is not determined
* (when it's required), if we can determine the outgoing
* interface. determine the zone ID based on the interface.
*/
error = in6_setscope(&dstsock->sin6_addr, oifp, NULL);
if (error != 0)
goto bad;
}
ip6->ip6_dst = dstsock->sin6_addr;
/* fill in the rest of the IPv6 header fields */
ip6->ip6_flow = in6p->in6p_flowinfo & IPV6_FLOWINFO_MASK;
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/* ip6_plen will be filled in ip6_output, so not fill it here. */
ip6->ip6_nxt = in6p->in6p_ip6.ip6_nxt;
ip6->ip6_hlim = in6_selecthlim(in6p, oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6 ||
in6p->in6p_cksum != -1) {
int off;
u_int16_t sum;
/* compute checksum */
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6)
off = offsetof(struct icmp6_hdr, icmp6_cksum);
else
off = in6p->in6p_cksum;
if (plen < off + 1) {
error = EINVAL;
goto bad;
}
off += sizeof(struct ip6_hdr);
sum = 0;
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
sum = in6_cksum(m, ip6->ip6_nxt, sizeof(*ip6), plen);
m = m_copyback_cow(m, off, sizeof(sum), (void *)&sum,
M_DONTWAIT);
if (m == NULL) {
error = ENOBUFS;
goto bad;
}
}
error = ip6_output(m, optp, &in6p->in6p_route, 0,
in6p->in6p_moptions, so, &oifp);
if (so->so_proto->pr_protocol == IPPROTO_ICMPV6) {
if (oifp)
icmp6_ifoutstat_inc(oifp, type, code);
2008-04-15 07:57:04 +04:00
ICMP6_STATINC(ICMP6_STAT_OUTHIST + type);
} else
2008-04-15 09:13:37 +04:00
RIP6_STATINC(RIP6_STAT_OPACKETS);
goto freectl;
bad:
if (m)
m_freem(m);
freectl:
if (control) {
ip6_clearpktopts(&opt, -1);
m_freem(control);
}
return error;
}
/*
* Raw IPv6 socket option processing.
*/
int
rip6_ctloutput(int op, struct socket *so, struct sockopt *sopt)
{
int error = 0;
if (sopt->sopt_level == SOL_SOCKET && sopt->sopt_name == SO_NOHEADER) {
int optval;
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
/* need to fiddle w/ opt(IPPROTO_IPV6, IPV6_CHECKSUM)? */
if (op == PRCO_GETOPT) {
optval = 1;
error = sockopt_set(sopt, &optval, sizeof(optval));
} else if (op == PRCO_SETOPT) {
error = sockopt_getint(sopt, &optval);
if (error)
goto out;
if (optval == 0)
error = EINVAL;
}
goto out;
} else if (sopt->sopt_level != IPPROTO_IPV6)
return ip6_ctloutput(op, so, sopt);
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
switch (sopt->sopt_name) {
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
case MRT6_INIT:
case MRT6_DONE:
case MRT6_ADD_MIF:
case MRT6_DEL_MIF:
case MRT6_ADD_MFC:
case MRT6_DEL_MFC:
case MRT6_PIM:
if (op == PRCO_SETOPT)
error = ip6_mrouter_set(so, sopt);
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
else if (op == PRCO_GETOPT)
error = ip6_mrouter_get(so, sopt);
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
else
error = EINVAL;
break;
case IPV6_CHECKSUM:
return ip6_raw_ctloutput(op, so, sopt);
default:
return ip6_ctloutput(op, so, sopt);
}
out:
1) Introduce a new socket option, (SOL_SOCKET, SO_NOHEADER), that tells a socket that it should both add a protocol header to tx'd datagrams and remove the header from rx'd datagrams: int onoff = 1, s = socket(...); setsockopt(s, SOL_SOCKET, SO_NOHEADER, &onoff); 2) Add an implementation of (SOL_SOCKET, SO_NOHEADER) for raw IPv4 sockets. 3) Reorganize the protocols' pr_ctloutput implementations a bit. Consistently return ENOPROTOOPT when an option is unsupported, and EINVAL if a supported option's arguments are incorrect. Reorganize the flow of code so that it's more clear how/when options are passed down the stack until they are handled. Shorten some pr_ctloutput staircases for readability. 4) Extract common mbuf code into subroutines, add new sockaddr methods, and introduce a new subroutine, fsocreate(), for reuse later; use it first in sys_socket(): struct mbuf *m_getsombuf(struct socket *so) Create an mbuf and make its owner the socket `so'. struct mbuf *m_intopt(struct socket *so, int val) Create an mbuf, make its owner the socket `so', put the int `val' into it, and set its length to sizeof(int). int fsocreate(..., int *fd) Create a socket, a la socreate(9), put the socket into the given LWP's descriptor table, return the descriptor at `fd' on success. void *sockaddr_addr(struct sockaddr *sa, socklen_t *slenp) const void *sockaddr_const_addr(const struct sockaddr *sa, socklen_t *slenp) Extract a pointer to the address part of a sockaddr. Write the length of the address part at `slenp', if `slenp' is not NULL. socklen_t sockaddr_getlen(const struct sockaddr *sa) Return the length of a sockaddr. This just evaluates to sa->sa_len. I only add this for consistency with code that appears in a portable userland library that I am going to import. const struct sockaddr *sockaddr_any(const struct sockaddr *sa) Return the "don't care" sockaddr in the same family as `sa'. 
This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses. const void *sockaddr_anyaddr(const struct sockaddr *sa, socklen_t *slenp) Return the "don't care" sockaddr in the same family as `sa'. This is the address a client should sobind(9) if it does not care the source address and, if applicable, the port et cetera that it uses.
2007-09-19 08:33:42 +04:00
return error;
}
/*
 * Defined elsewhere.
 * NOTE(review): presumably the default send/receive buffer reservations
 * for raw IPv6 sockets -- their use is not visible in this part of the
 * file; confirm against the attach path.
 */
extern u_long rip6_sendspace;
extern u_long rip6_recvspace;
int
rip6_usrreq(struct socket *so, int req, struct mbuf *m,
struct mbuf *nam, struct mbuf *control, struct lwp *l)
{
struct in6pcb *in6p = sotoin6pcb(so);
int s;
int error = 0;
if (req == PRU_CONTROL)
return in6_control(so, (u_long)m, (void *)nam,
(struct ifnet *)control, l);
if (req == PRU_PURGEIF) {
mutex_enter(softnet_lock);
in6_pcbpurgeif0(&raw6cbtable, (struct ifnet *)control);
in6_purgeif((struct ifnet *)control);
in6_pcbpurgeif(&raw6cbtable, (struct ifnet *)control);
mutex_exit(softnet_lock);
return 0;
}
switch (req) {
case PRU_ATTACH:
error = kauth_authorize_network(l->l_cred,
KAUTH_NETWORK_SOCKET, KAUTH_REQ_NETWORK_SOCKET_RAWSOCK,
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
KAUTH_ARG(AF_INET6),
KAUTH_ARG(SOCK_RAW),
KAUTH_ARG(so->so_proto->pr_protocol));
sosetlock(so);
if (in6p != NULL)
panic("rip6_attach");
if (error) {
break;
}
s = splsoftnet();
error = soreserve(so, rip6_sendspace, rip6_recvspace);
if (error != 0) {
splx(s);
break;
}
if ((error = in6_pcballoc(so, &raw6cbtable)) != 0) {
splx(s);
break;
}
splx(s);
in6p = sotoin6pcb(so);
in6p->in6p_ip6.ip6_nxt = (long)nam;
in6p->in6p_cksum = -1;
2002-06-09 18:43:10 +04:00
2008-12-17 23:51:31 +03:00
in6p->in6p_icmp6filt = malloc(sizeof(struct icmp6_filter),
M_PCB, M_NOWAIT);
if (in6p->in6p_icmp6filt == NULL) {
in6_pcbdetach(in6p);
error = ENOMEM;
break;
}
ICMP6_FILTER_SETPASSALL(in6p->in6p_icmp6filt);
break;
case PRU_DISCONNECT:
if ((so->so_state & SS_ISCONNECTED) == 0) {
error = ENOTCONN;
break;
}
in6p->in6p_faddr = in6addr_any;
so->so_state &= ~SS_ISCONNECTED; /* XXX */
break;
case PRU_ABORT:
soisdisconnected(so);
/* Fallthrough */
case PRU_DETACH:
if (in6p == NULL)
panic("rip6_detach");
if (so == ip6_mrouter)
ip6_mrouter_done();
/* xxx: RSVP */
if (in6p->in6p_icmp6filt != NULL) {
2008-12-17 23:51:31 +03:00
free(in6p->in6p_icmp6filt, M_PCB);
in6p->in6p_icmp6filt = NULL;
}
in6_pcbdetach(in6p);
break;
case PRU_BIND:
{
struct sockaddr_in6 *addr = mtod(nam, struct sockaddr_in6 *);
struct ifaddr *ia = NULL;
if (nam->m_len != sizeof(*addr)) {
error = EINVAL;
break;
}
if (TAILQ_EMPTY(&ifnet) || addr->sin6_family != AF_INET6) {
error = EADDRNOTAVAIL;
break;
}
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
break;
/*
* we don't support mapped address here, it would confuse
* users so reject it
*/
if (IN6_IS_ADDR_V4MAPPED(&addr->sin6_addr)) {
error = EADDRNOTAVAIL;
break;
}
if (!IN6_IS_ADDR_UNSPECIFIED(&addr->sin6_addr) &&
(ia = ifa_ifwithaddr((struct sockaddr *)addr)) == 0) {
error = EADDRNOTAVAIL;
break;
}
2002-06-08 02:07:38 +04:00
if (ia && ((struct in6_ifaddr *)ia)->ia6_flags &
(IN6_IFF_ANYCAST|IN6_IFF_NOTREADY|
IN6_IFF_DETACHED|IN6_IFF_DEPRECATED)) {
error = EADDRNOTAVAIL;
break;
}
in6p->in6p_laddr = addr->sin6_addr;
break;
}
case PRU_CONNECT:
{
struct sockaddr_in6 *addr = mtod(nam, struct sockaddr_in6 *);
struct in6_addr *in6a = NULL;
struct ifnet *ifp = NULL;
int scope_ambiguous = 0;
if (nam->m_len != sizeof(*addr)) {
error = EINVAL;
break;
}
if (TAILQ_EMPTY(&ifnet)) {
error = EADDRNOTAVAIL;
break;
}
if (addr->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
break;
}
/*
* Application should provide a proper zone ID or the use of
* default zone IDs should be enabled. Unfortunately, some
* applications do not behave as it should, so we need a
* workaround. Even if an appropriate ID is not determined,
* we'll see if we can determine the outgoing interface. If we
* can, determine the zone ID based on the interface below.
*/
if (addr->sin6_scope_id == 0 && !ip6_use_defzone)
scope_ambiguous = 1;
if ((error = sa6_embedscope(addr, ip6_use_defzone)) != 0)
return error;
/* Source address selection. XXX: need pcblookup? */
in6a = in6_selectsrc(addr, in6p->in6p_outputopts,
in6p->in6p_moptions, &in6p->in6p_route,
&in6p->in6p_laddr, &ifp, &error);
if (in6a == NULL) {
if (error == 0)
error = EADDRNOTAVAIL;
break;
}
/* XXX: see above */
if (ifp && scope_ambiguous &&
(error = in6_setscope(&addr->sin6_addr, ifp, NULL)) != 0) {
break;
}
in6p->in6p_laddr = *in6a;
in6p->in6p_faddr = addr->sin6_addr;
soisconnected(so);
break;
}
case PRU_CONNECT2:
error = EOPNOTSUPP;
break;
/*
* Mark the connection as being incapable of futther input.
*/
case PRU_SHUTDOWN:
socantsendmore(so);
break;
/*
* Ship a packet out. The appropriate raw output
* routine handles any messaging necessary.
*/
case PRU_SEND:
{
struct sockaddr_in6 tmp;
struct sockaddr_in6 *dst;
/* always copy sockaddr to avoid overwrites */
if (so->so_state & SS_ISCONNECTED) {
if (nam) {
error = EISCONN;
break;
}
/* XXX */
2007-11-07 02:40:38 +03:00
sockaddr_in6_init(&tmp, &in6p->in6p_faddr, 0, 0, 0);
dst = &tmp;
} else {
if (nam == NULL) {
error = ENOTCONN;
break;
}
if (nam->m_len != sizeof(tmp)) {
error = EINVAL;
break;
}
tmp = *mtod(nam, struct sockaddr_in6 *);
dst = &tmp;
if (dst->sin6_family != AF_INET6) {
error = EAFNOSUPPORT;
break;
}
}
error = rip6_output(m, so, dst, control);
m = NULL;
break;
}
case PRU_SENSE:
/*
* stat: don't bother with a blocksize
*/
return 0;
/*
* Not supported.
*/
case PRU_RCVOOB:
case PRU_RCVD:
case PRU_LISTEN:
case PRU_ACCEPT:
case PRU_SENDOOB:
error = EOPNOTSUPP;
break;
case PRU_SOCKADDR:
in6_setsockaddr(in6p, nam);
break;
case PRU_PEERADDR:
in6_setpeeraddr(in6p, nam);
break;
default:
panic("rip6_usrreq");
}
if (m != NULL)
m_freem(m);
return error;
}
2008-04-15 09:13:37 +04:00
/*
 * Sysctl handler for the raw IPv6 "stats" node: returns whatever
 * NETSTAT_SYSCTL() yields for rip6stat_percpu with RIP6_NSTATS entries.
 */
static int
sysctl_net_inet6_raw6_stats(SYSCTLFN_ARGS)
{

	return NETSTAT_SYSCTL(rip6stat_percpu, RIP6_NSTATS);
}
static void
sysctl_net_inet6_raw6_setup(struct sysctllog **clog)
{
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "net", NULL,
NULL, 0, NULL, 0,
CTL_NET, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "inet6", NULL,
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_NODE, "raw6",
SYSCTL_DESCR("Raw IPv6 settings"),
NULL, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "pcblist",
SYSCTL_DESCR("Raw IPv6 control block list"),
sysctl_inpcblist, 0, &raw6cbtable, 0,
CTL_NET, PF_INET6, IPPROTO_RAW,
CTL_CREATE, CTL_EOL);
sysctl_createv(clog, 0, NULL, NULL,
CTLFLAG_PERMANENT,
CTLTYPE_STRUCT, "stats",
SYSCTL_DESCR("Raw IPv6 statistics"),
2008-04-15 09:13:37 +04:00
sysctl_net_inet6_raw6_stats, 0, NULL, 0,
CTL_NET, PF_INET6, IPPROTO_RAW, RAW6CTL_STATS,
CTL_EOL);
}