NetBSD/sys/netinet/tcp_subr.c

2192 lines
57 KiB
C
Raw Normal View History

2022-11-04 12:01:53 +03:00
/* $NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $ */
/*
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
* All rights reserved.
2002-06-09 20:33:36 +04:00
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the project nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
2002-06-09 20:33:36 +04:00
*
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Copyright (c) 1997, 1998, 2000, 2001, 2008 The NetBSD Foundation, Inc.
* All rights reserved.
*
* This code is derived from software contributed to The NetBSD Foundation
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
* Facility, NASA Ames Research Center.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*/
1993-03-21 12:45:37 +03:00
/*
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
* The Regents of the University of California. All rights reserved.
1993-03-21 12:45:37 +03:00
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
1993-03-21 12:45:37 +03:00
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
* @(#)tcp_subr.c 8.2 (Berkeley) 5/24/95
1993-03-21 12:45:37 +03:00
*/
2001-11-13 03:32:34 +03:00
#include <sys/cdefs.h>
2022-11-04 12:01:53 +03:00
__KERNEL_RCSID(0, "$NetBSD: tcp_subr.c,v 1.296 2022/11/04 09:01:53 ozaki-r Exp $");
2001-11-13 03:32:34 +03:00
2015-08-25 01:21:26 +03:00
#ifdef _KERNEL_OPT
#include "opt_inet.h"
#include "opt_ipsec.h"
#include "opt_inet_csum.h"
#include "opt_mbuftrace.h"
2015-08-25 01:21:26 +03:00
#endif
1993-12-18 03:40:47 +03:00
#include <sys/param.h>
2014-01-02 22:52:04 +04:00
#include <sys/atomic.h>
#include <sys/proc.h>
1993-12-18 03:40:47 +03:00
#include <sys/systm.h>
#include <sys/mbuf.h>
#include <sys/once.h>
1993-12-18 03:40:47 +03:00
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/protosw.h>
#include <sys/errno.h>
#include <sys/kernel.h>
1998-08-02 04:36:19 +04:00
#include <sys/pool.h>
#include <sys/md5.h>
First step of random number subsystem rework described in <20111022023242.BA26F14A158@mail.netbsd.org>. This change includes the following: An initial cleanup and minor reorganization of the entropy pool code in sys/dev/rnd.c and sys/dev/rndpool.c. Several bugs are fixed. Some effort is made to accumulate entropy more quickly at boot time. A generic interface, "rndsink", is added, for stream generators to request that they be re-keyed with good quality entropy from the pool as soon as it is available. The arc4random()/arc4randbytes() implementation in libkern is adjusted to use the rndsink interface for rekeying, which helps address the problem of low-quality keys at boot time. An implementation of the FIPS 140-2 statistical tests for random number generator quality is provided (libkern/rngtest.c). This is based on Greg Rose's implementation from Qualcomm. A new random stream generator, nist_ctr_drbg, is provided. It is based on an implementation of the NIST SP800-90 CTR_DRBG by Henric Jungheim. This generator users AES in a modified counter mode to generate a backtracking-resistant random stream. An abstraction layer, "cprng", is provided for in-kernel consumers of randomness. The arc4random/arc4randbytes API is deprecated for in-kernel use. It is replaced by "cprng_strong". The current cprng_fast implementation wraps the existing arc4random implementation. The current cprng_strong implementation wraps the new CTR_DRBG implementation. Both interfaces are rekeyed from the entropy pool automatically at intervals justifiable from best current cryptographic practice. In some quick tests, cprng_fast() is about the same speed as the old arc4randbytes(), and cprng_strong() is about 20% faster than rnd_extract_data(). Performance is expected to improve. The AES code in src/crypto/rijndael is no longer an optional kernel component, as it is required by cprng_strong, which is not an optional kernel component. 
The entropy pool output is subjected to the rngtest tests at startup time; if it fails, the system will reboot. There is approximately a 3/10000 chance of a false positive from these tests. Entropy pool _input_ from hardware random numbers is subjected to the rngtest tests at attach time, as well as the FIPS continuous-output test, to detect bad or stuck hardware RNGs; if any are detected, they are detached, but the system continues to run. A problem with rndctl(8) is fixed -- datastructures with pointers in arrays are no longer passed to userspace (this was not a security problem, but rather a major issue for compat32). A new kernel will require a new rndctl. The sysctl kern.arandom() and kern.urandom() nodes are hooked up to the new generators, but the /dev/*random pseudodevices are not, yet. Manual pages for the new kernel interfaces are forthcoming.
2011-11-20 02:51:18 +04:00
#include <sys/cprng.h>
1993-03-21 12:45:37 +03:00
1993-12-18 03:40:47 +03:00
#include <net/route.h>
#include <net/if.h>
1993-03-21 12:45:37 +03:00
1993-12-18 03:40:47 +03:00
#include <netinet/in.h>
#include <netinet/in_systm.h>
#include <netinet/ip.h>
#include <netinet/in_pcb.h>
#include <netinet/ip_var.h>
#include <netinet/ip_icmp.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6protosw.h>
#include <netinet/icmp6.h>
#include <netinet6/nd6.h>
#endif
1993-12-18 03:40:47 +03:00
#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_seq.h>
#include <netinet/tcp_timer.h>
#include <netinet/tcp_var.h>
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
#include <netinet/tcp_vtw.h>
#include <netinet/tcp_private.h>
#include <netinet/tcp_congctl.h>
#include <netinet/tcp_syncache.h>
1993-03-21 12:45:37 +03:00
#ifdef IPSEC
#include <netipsec/ipsec.h>
#ifdef INET6
#include <netipsec/ipsec6.h>
#endif
#include <netipsec/key.h>
#endif
2002-05-13 00:33:50 +04:00
struct inpcbtable tcbtable; /* head of queue of active tcpcb's */
u_int32_t tcp_now; /* slow ticks, for RFC 1323 timestamps */
2002-05-13 00:33:50 +04:00
percpu_t *tcpstat_percpu;
1993-03-21 12:45:37 +03:00
/* patchable/settable parameters for tcp */
int tcp_mssdflt = TCP_MSS;
int tcp_minmss = TCP_MINMSS;
1993-03-21 12:45:37 +03:00
int tcp_rttdflt = TCPTV_SRTTDFLT / PR_SLOWHZ;
int tcp_do_rfc1323 = 1; /* window scaling / timestamps (obsolete) */
int tcp_do_rfc1948 = 0; /* ISS by cryptographic hash */
int tcp_do_sack = 1; /* selective acknowledgement */
int tcp_do_win_scale = 1; /* RFC1323 window scaling */
int tcp_do_timestamps = 1; /* RFC1323 timestamps */
int tcp_ack_on_push = 0; /* set to enable immediate ACK-on-PUSH */
int tcp_do_ecn = 0; /* Explicit Congestion Notification */
#ifndef TCP_INIT_WIN
#define TCP_INIT_WIN 4 /* initial slow start window */
#endif
#ifndef TCP_INIT_WIN_LOCAL
#define TCP_INIT_WIN_LOCAL 4 /* initial slow start window for local nets */
#endif
/*
* Up to 5 we scale linearly, to reach 3 * 1460; then (iw) * 1460.
* This is to simulate current behavior for iw == 4
*/
int tcp_init_win_max[] = {
1 * 1460,
1 * 1460,
2 * 1460,
2 * 1460,
3 * 1460,
5 * 1460,
6 * 1460,
7 * 1460,
8 * 1460,
9 * 1460,
10 * 1460
};
int tcp_init_win = TCP_INIT_WIN;
int tcp_init_win_local = TCP_INIT_WIN_LOCAL;
int tcp_mss_ifmtu = 0;
int tcp_rst_ppslim = 100; /* 100pps */
int tcp_ackdrop_ppslim = 100; /* 100pps */
int tcp_do_loopback_cksum = 0;
int tcp_do_abc = 1; /* RFC3465 Appropriate byte counting. */
int tcp_abc_aggressive = 1; /* 1: L=2*SMSS 0: L=1*SMSS */
int tcp_sack_tp_maxholes = 32;
int tcp_sack_globalmaxholes = 1024;
int tcp_sack_globalholes = 0;
int tcp_ecn_maxretries = 1;
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
int tcp_msl_enable = 1; /* enable TIME_WAIT truncation */
int tcp_msl_loop = PR_SLOWHZ; /* MSL for loopback */
int tcp_msl_local = 5 * PR_SLOWHZ; /* MSL for 'local' */
int tcp_msl_remote = TCPTV_MSL; /* MSL otherwise */
int tcp_msl_remote_threshold = TCPTV_SRTTDFLT; /* RTT threshold */
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
int tcp_rttlocal = 0; /* Use RTT to decide who's 'local' */
int tcp4_vtw_enable = 0; /* 1 to enable */
int tcp6_vtw_enable = 0; /* 1 to enable */
int tcp_vtw_was_enabled = 0;
int tcp_vtw_entries = 1 << 4; /* 16 vestigial TIME_WAIT entries */
/* tcb hash */
#ifndef TCBHASHSIZE
#define TCBHASHSIZE 128
#endif
int tcbhashsize = TCBHASHSIZE;
1993-03-21 12:45:37 +03:00
2005-02-03 00:41:55 +03:00
int tcp_freeq(struct tcpcb *);
static int tcp_iss_secret_init(void);
1997-12-10 04:58:07 +03:00
static void tcp_mtudisc_callback(struct in_addr);
#ifdef INET6
static void tcp6_mtudisc(struct inpcb *, int);
#endif
2008-10-13 23:44:21 +04:00
static struct pool tcpcb_pool;
1998-08-02 04:36:19 +04:00
static int tcp_drainwanted;
#ifdef TCP_CSUM_COUNTERS
#include <sys/device.h>

/* Parentless EVCNT_TYPE_MISC counter in the given group. */
#define	TCP_CSUM_EVCNT(group, name)					\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, group, name)

struct evcnt tcp_hwcsum_bad = TCP_CSUM_EVCNT("tcp", "hwcsum bad");
struct evcnt tcp_hwcsum_ok = TCP_CSUM_EVCNT("tcp", "hwcsum ok");
struct evcnt tcp_hwcsum_data = TCP_CSUM_EVCNT("tcp", "hwcsum data");
struct evcnt tcp_swcsum = TCP_CSUM_EVCNT("tcp", "swcsum");

EVCNT_ATTACH_STATIC(tcp_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp_swcsum);

#if defined(INET6)
struct evcnt tcp6_hwcsum_bad = TCP_CSUM_EVCNT("tcp6", "hwcsum bad");
struct evcnt tcp6_hwcsum_ok = TCP_CSUM_EVCNT("tcp6", "hwcsum ok");
struct evcnt tcp6_hwcsum_data = TCP_CSUM_EVCNT("tcp6", "hwcsum data");
struct evcnt tcp6_swcsum = TCP_CSUM_EVCNT("tcp6", "swcsum");

EVCNT_ATTACH_STATIC(tcp6_hwcsum_bad);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_ok);
EVCNT_ATTACH_STATIC(tcp6_hwcsum_data);
EVCNT_ATTACH_STATIC(tcp6_swcsum);
#endif /* defined(INET6) */

#undef TCP_CSUM_EVCNT
#endif /* TCP_CSUM_COUNTERS */
2004-05-01 06:20:42 +04:00
#ifdef TCP_OUTPUT_COUNTERS
#include <sys/device.h>

/* Parentless EVCNT_TYPE_MISC counter in the "tcp" group. */
#define	TCP_OUTPUT_EVCNT(name)						\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, NULL, "tcp", name)

struct evcnt tcp_output_bigheader = TCP_OUTPUT_EVCNT("output big header");
struct evcnt tcp_output_predict_hit = TCP_OUTPUT_EVCNT("output predict hit");
struct evcnt tcp_output_predict_miss = TCP_OUTPUT_EVCNT("output predict miss");
struct evcnt tcp_output_copysmall = TCP_OUTPUT_EVCNT("output copy small");
struct evcnt tcp_output_copybig = TCP_OUTPUT_EVCNT("output copy big");
struct evcnt tcp_output_refbig = TCP_OUTPUT_EVCNT("output reference big");

#undef TCP_OUTPUT_EVCNT

EVCNT_ATTACH_STATIC(tcp_output_bigheader);
EVCNT_ATTACH_STATIC(tcp_output_predict_hit);
EVCNT_ATTACH_STATIC(tcp_output_predict_miss);
EVCNT_ATTACH_STATIC(tcp_output_copysmall);
EVCNT_ATTACH_STATIC(tcp_output_copybig);
EVCNT_ATTACH_STATIC(tcp_output_refbig);
#endif /* TCP_OUTPUT_COUNTERS */
#ifdef TCP_REASS_COUNTERS
#include <sys/device.h>

/* Root counter; every other counter in this group names it as parent. */
struct evcnt tcp_reass_ = EVCNT_INITIALIZER(EVCNT_TYPE_MISC,
    NULL, "tcp_reass", "calls");

/* Child of tcp_reass_ in the "tcp_reass" group. */
#define	TCP_REASS_EVCNT(name)						\
	EVCNT_INITIALIZER(EVCNT_TYPE_MISC, &tcp_reass_, "tcp_reass", name)

struct evcnt tcp_reass_empty = TCP_REASS_EVCNT("insert into empty queue");

/* Slot 0 carries the ">7 iterations" label; slots 1-7 the exact counts. */
struct evcnt tcp_reass_iteration[8] = {
	TCP_REASS_EVCNT(">7 iterations"),
	TCP_REASS_EVCNT("1 iteration"),
	TCP_REASS_EVCNT("2 iterations"),
	TCP_REASS_EVCNT("3 iterations"),
	TCP_REASS_EVCNT("4 iterations"),
	TCP_REASS_EVCNT("5 iterations"),
	TCP_REASS_EVCNT("6 iterations"),
	TCP_REASS_EVCNT("7 iterations"),
};

struct evcnt tcp_reass_prependfirst = TCP_REASS_EVCNT("prepend to first");
struct evcnt tcp_reass_prepend = TCP_REASS_EVCNT("prepend");
struct evcnt tcp_reass_insert = TCP_REASS_EVCNT("insert");
struct evcnt tcp_reass_inserttail = TCP_REASS_EVCNT("insert at tail");
struct evcnt tcp_reass_append = TCP_REASS_EVCNT("append");
struct evcnt tcp_reass_appendtail = TCP_REASS_EVCNT("append to tail fragment");
struct evcnt tcp_reass_overlaptail = TCP_REASS_EVCNT("overlap at end");
struct evcnt tcp_reass_overlapfront = TCP_REASS_EVCNT("overlap at start");
struct evcnt tcp_reass_segdup = TCP_REASS_EVCNT("duplicate segment");
struct evcnt tcp_reass_fragdup = TCP_REASS_EVCNT("duplicate fragment");

#undef TCP_REASS_EVCNT

EVCNT_ATTACH_STATIC(tcp_reass_);
EVCNT_ATTACH_STATIC(tcp_reass_empty);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 0);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 1);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 2);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 3);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 4);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 5);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 6);
EVCNT_ATTACH_STATIC2(tcp_reass_iteration, 7);
EVCNT_ATTACH_STATIC(tcp_reass_prependfirst);
EVCNT_ATTACH_STATIC(tcp_reass_prepend);
EVCNT_ATTACH_STATIC(tcp_reass_insert);
EVCNT_ATTACH_STATIC(tcp_reass_inserttail);
EVCNT_ATTACH_STATIC(tcp_reass_append);
EVCNT_ATTACH_STATIC(tcp_reass_appendtail);
EVCNT_ATTACH_STATIC(tcp_reass_overlaptail);
EVCNT_ATTACH_STATIC(tcp_reass_overlapfront);
EVCNT_ATTACH_STATIC(tcp_reass_segdup);
EVCNT_ATTACH_STATIC(tcp_reass_fragdup);
#endif /* TCP_REASS_COUNTERS */
#ifdef MBUFTRACE
/* mbuf owners, all named "tcp" and told apart by their description. */
struct mowner tcp_mowner	 = MOWNER_INIT("tcp", "");
struct mowner tcp_rx_mowner	 = MOWNER_INIT("tcp", "rx");
struct mowner tcp_tx_mowner	 = MOWNER_INIT("tcp", "tx");
struct mowner tcp_sock_mowner	 = MOWNER_INIT("tcp", "sock");
struct mowner tcp_sock_rx_mowner = MOWNER_INIT("tcp", "sock rx");
struct mowner tcp_sock_tx_mowner = MOWNER_INIT("tcp", "sock tx");
#endif
static int
do_tcpinit(void)
1993-03-21 12:45:37 +03:00
{
inpcb_init(&tcbtable, tcbhashsize, tcbhashsize);
2008-10-13 23:44:21 +04:00
pool_init(&tcpcb_pool, sizeof(struct tcpcb), 0, 0, 0, "tcpcbpl",
NULL, IPL_SOFTNET);
tcp_usrreq_init();
/* Initialize timer state. */
tcp_timer_init();
/* Initialize the compressed state engine. */
syn_cache_init();
/* Initialize the congestion control algorithms. */
tcp_congctl_init();
/* Initialize the TCPCB template. */
tcp_tcpcb_template();
/* Initialize reassembly queue */
tcpipqent_init();
2009-05-27 21:41:03 +04:00
/* SACK */
tcp_sack_init();
MOWNER_ATTACH(&tcp_tx_mowner);
MOWNER_ATTACH(&tcp_rx_mowner);
2006-12-06 12:10:45 +03:00
MOWNER_ATTACH(&tcp_reass_mowner);
MOWNER_ATTACH(&tcp_sock_mowner);
MOWNER_ATTACH(&tcp_sock_tx_mowner);
MOWNER_ATTACH(&tcp_sock_rx_mowner);
MOWNER_ATTACH(&tcp_mowner);
tcpstat_percpu = percpu_alloc(sizeof(uint64_t) * TCP_NSTATS);
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
vtw_earlyinit();
tcp_slowtimo_init();
return 0;
}
/*
 * Initialization shared by the per-address-family TCP entry points.
 *
 * basehlen is the size of the network-layer header that precedes the
 * TCP header.  Panics with "tcp_init" if the link header plus the
 * combined protocol header cannot fit in MHLEN bytes.  Raises
 * max_protohdr to at least the combined header size, then runs
 * do_tcpinit() through RUN_ONCE().
 */
void
tcp_init_common(unsigned basehlen)
{
	static ONCE_DECL(dotcpinit);
	const unsigned hlen = basehlen + sizeof(struct tcphdr);

	if (max_linkhdr + hlen > MHLEN)
		panic("tcp_init");

	/*
	 * Compare-and-swap retry loop: re-read max_protohdr after every
	 * attempt and stop as soon as it is no smaller than hlen.
	 */
	for (;;) {
		const unsigned cur = max_protohdr;

		if (cur >= hlen)
			break;
		atomic_cas_uint(&max_protohdr, cur, hlen);
	}

	RUN_ONCE(&dotcpinit, do_tcpinit);
}
/*
 * TCP initialization for IPv4: register tcp_mtudisc_callback with
 * icmp_mtudisc_callback_register(), then run the common initialization
 * with struct ip as the base header.
 */
void
tcp_init(void)
{

	icmp_mtudisc_callback_register(tcp_mtudisc_callback);

	tcp_init_common(sizeof(struct ip));
}
/*
* Create template to be used to send tcp packets on a connection.
* Call after host entry created, allocates an mbuf and fills
* in a skeletal tcp/ip header, minimizing the amount of work
* necessary when the connection is used.
*/
struct mbuf *
2005-02-04 02:50:33 +03:00
tcp_template(struct tcpcb *tp)
1993-03-21 12:45:37 +03:00
{
2000-03-30 16:51:13 +04:00
struct inpcb *inp = tp->t_inpcb;
struct tcphdr *n;
struct mbuf *m;
int hlen;
switch (tp->t_family) {
case AF_INET:
hlen = sizeof(struct ip);
if (inp->inp_af == AF_INET)
break;
#ifdef INET6
if (inp->inp_af == AF_INET6) {
/* mapped addr case */
if (IN6_IS_ADDR_V4MAPPED(&in6p_laddr(inp))
&& IN6_IS_ADDR_V4MAPPED(&in6p_faddr(inp)))
break;
}
#endif
return NULL; /*EINVAL*/
#ifdef INET6
case AF_INET6:
hlen = sizeof(struct ip6_hdr);
if (inp != NULL) {
/* more sainty check? */
break;
}
return NULL; /*EINVAL*/
#endif
default:
return NULL; /*EAFNOSUPPORT*/
}
KASSERT(hlen + sizeof(struct tcphdr) <= MCLBYTES);
m = tp->t_template;
if (m && m->m_len == hlen + sizeof(struct tcphdr)) {
;
} else {
if (m)
m_freem(m);
m = tp->t_template = NULL;
MGETHDR(m, M_DONTWAIT, MT_HEADER);
if (m && hlen + sizeof(struct tcphdr) > MHLEN) {
MCLGET(m, M_DONTWAIT);
if ((m->m_flags & M_EXT) == 0) {
m_free(m);
m = NULL;
}
}
if (m == NULL)
return NULL;
MCLAIM(m, &tcp_mowner);
m->m_pkthdr.len = m->m_len = hlen + sizeof(struct tcphdr);
}
2009-03-18 19:00:08 +03:00
memset(mtod(m, void *), 0, m->m_len);
n = (struct tcphdr *)(mtod(m, char *) + hlen);
switch (tp->t_family) {
case AF_INET:
{
struct ipovly *ipov;
mtod(m, struct ip *)->ip_v = 4;
mtod(m, struct ip *)->ip_hl = hlen >> 2;
ipov = mtod(m, struct ipovly *);
ipov->ih_pr = IPPROTO_TCP;
ipov->ih_len = htons(sizeof(struct tcphdr));
if (inp->inp_af == AF_INET) {
ipov->ih_src = in4p_laddr(inp);
ipov->ih_dst = in4p_faddr(inp);
}
#ifdef INET6
else if (inp->inp_af == AF_INET6) {
/* mapped addr case */
bcopy(&in6p_laddr(inp).s6_addr32[3], &ipov->ih_src,
sizeof(ipov->ih_src));
bcopy(&in6p_faddr(inp).s6_addr32[3], &ipov->ih_dst,
sizeof(ipov->ih_dst));
}
#endif
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in_cksum_phdr(ipov->ih_src.s_addr,
ipov->ih_dst.s_addr,
htons(sizeof(struct tcphdr) + IPPROTO_TCP));
break;
}
#ifdef INET6
case AF_INET6:
{
struct ip6_hdr *ip6;
mtod(m, struct ip *)->ip_v = 6;
ip6 = mtod(m, struct ip6_hdr *);
ip6->ip6_nxt = IPPROTO_TCP;
ip6->ip6_plen = htons(sizeof(struct tcphdr));
ip6->ip6_src = in6p_laddr(inp);
ip6->ip6_dst = in6p_faddr(inp);
ip6->ip6_flow = in6p_flowinfo(inp) & IPV6_FLOWINFO_MASK;
if (ip6_auto_flowlabel) {
ip6->ip6_flow &= ~IPV6_FLOWLABEL_MASK;
2002-06-09 20:33:36 +04:00
ip6->ip6_flow |=
(htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
}
ip6->ip6_vfc &= ~IPV6_VERSION_MASK;
ip6->ip6_vfc |= IPV6_VERSION;
/*
* Compute the pseudo-header portion of the checksum
* now. We incrementally add in the TCP option and
* payload lengths later, and then compute the TCP
* checksum right before the packet is sent off onto
* the wire.
*/
n->th_sum = in6_cksum_phdr(&in6p_laddr(inp),
&in6p_faddr(inp), htonl(sizeof(struct tcphdr)),
htonl(IPPROTO_TCP));
break;
}
#endif
}
n->th_sport = inp->inp_lport;
n->th_dport = inp->inp_fport;
n->th_seq = 0;
n->th_ack = 0;
n->th_x2 = 0;
n->th_off = 5;
n->th_flags = 0;
n->th_win = 0;
n->th_urp = 0;
return m;
1993-03-21 12:45:37 +03:00
}
/*
 * Build and transmit a single TCP segment, either on behalf of an
 * existing connection or in reply to a received segment.
 *
 * The mode is selected by `m':
 *
 *  - m == NULL: a fresh cluster mbuf is allocated and the header
 *    template `mtemplate' (e.g. tp->t_template) is copied into it.
 *    The segment is always sent with just TH_ACK set, whatever the
 *    caller passed in `flags'.  This is used to force keep alive
 *    messages out.
 *
 *  - m != NULL: `m' holds a received packet whose TCP header is `th0'.
 *    The IP addresses and TCP ports are swapped so that the segment
 *    goes back to the TCP which originated it.  `m' is always consumed:
 *    it is either trimmed and reused, or freed once its headers have
 *    been copied into a new mbuf (and on every error path).
 *
 * In any case the ack and sequence number of the transmitted segment
 * are as specified by the parameters (host byte order).  `tp' may be
 * NULL; then a zero window is used and neither a cached route nor an
 * inpcb is handed to the output routine.
 *
 * Returns 0 or an errno value: EINVAL (no template, or `m' has no
 * packet header), EAFNOSUPPORT (IP version not handled), ENOBUFS (mbuf
 * allocation failed), or the result of ip_output()/ip6_output().
 */
int
tcp_respond(struct tcpcb *tp, struct mbuf *mtemplate, struct mbuf *m,
    struct tcphdr *th0, tcp_seq ack, tcp_seq seq, int flags)
{
	struct route *ro;
	int error, tlen, win = 0;
	int hlen;
	struct ip *ip;
#ifdef INET6
	struct ip6_hdr *ip6;
#endif
	int family;	/* family on packet, not inpcb! */
	struct tcphdr *th;

	/* Only offer receive space when we have a pcb and are not resetting. */
	if (tp != NULL && (flags & TH_RST) == 0) {
		KASSERT(tp->t_inpcb != NULL);

		win = sbspace(&tp->t_inpcb->inp_socket->so_rcv);
	}

	th = NULL;	/* Quell uninitialized warning */
	ip = NULL;
#ifdef INET6
	ip6 = NULL;
#endif
	if (m == NULL) {
		if (!mtemplate)
			return EINVAL;

		/* get family information from template */
		switch (mtod(mtemplate, struct ip *)->ip_v) {
		case 4:
			family = AF_INET;
			hlen = sizeof(struct ip);
			break;
#ifdef INET6
		case 6:
			family = AF_INET6;
			hlen = sizeof(struct ip6_hdr);
			break;
#endif
		default:
			return EAFNOSUPPORT;
		}

		/* The template is copied into a cluster; fail without one. */
		MGETHDR(m, M_DONTWAIT, MT_HEADER);
		if (m) {
			MCLAIM(m, &tcp_tx_mowner);
			MCLGET(m, M_DONTWAIT);
			if ((m->m_flags & M_EXT) == 0) {
				m_free(m);
				m = NULL;
			}
		}
		if (m == NULL)
			return ENOBUFS;

		tlen = 0;

		/* Leave room in front for the link-layer header. */
		m->m_data += max_linkhdr;
		bcopy(mtod(mtemplate, void *), mtod(m, void *),
		    mtemplate->m_len);
		switch (family) {
		case AF_INET:
			ip = mtod(m, struct ip *);
			th = (struct tcphdr *)(ip + 1);
			break;
#ifdef INET6
		case AF_INET6:
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)(ip6 + 1);
			break;
#endif
		}
		flags = TH_ACK;
	} else {
		if ((m->m_flags & M_PKTHDR) == 0) {
			m_freem(m);
			return EINVAL;
		}
		KASSERT(th0 != NULL);

		/* get family information from m */
		switch (mtod(m, struct ip *)->ip_v) {
		case 4:
			family = AF_INET;
			hlen = sizeof(struct ip);
			ip = mtod(m, struct ip *);
			break;
#ifdef INET6
		case 6:
			family = AF_INET6;
			hlen = sizeof(struct ip6_hdr);
			ip6 = mtod(m, struct ip6_hdr *);
			break;
#endif
		default:
			m_freem(m);
			return EAFNOSUPPORT;
		}
		/* clear h/w csum flags inherited from rx packet */
		m->m_pkthdr.csum_flags = 0;

		/*
		 * Keep the TCP options of th0 only when replying with SYN
		 * set and th_off covers at least the fixed header.
		 */
		if ((flags & TH_SYN) == 0 || sizeof(*th0) > (th0->th_off << 2))
			tlen = sizeof(*th0);
		else
			tlen = th0->th_off << 2;

		if (m->m_len > hlen + tlen && (m->m_flags & M_EXT) == 0 &&
		    mtod(m, char *) + hlen == (char *)th0) {
			/*
			 * Headers are contiguous in a non-external first
			 * mbuf: reuse it and drop everything after them.
			 */
			m->m_len = hlen + tlen;
			m_freem(m->m_next);
			m->m_next = NULL;
		} else {
			struct mbuf *n;

			KASSERT(max_linkhdr + hlen + tlen <= MCLBYTES);

			/* Otherwise copy the IP and TCP headers to a new mbuf. */
			MGETHDR(n, M_DONTWAIT, MT_HEADER);
			if (n && max_linkhdr + hlen + tlen > MHLEN) {
				MCLGET(n, M_DONTWAIT);
				if ((n->m_flags & M_EXT) == 0) {
					m_freem(n);
					n = NULL;
				}
			}
			if (!n) {
				m_freem(m);
				return ENOBUFS;
			}

			MCLAIM(n, &tcp_tx_mowner);
			n->m_data += max_linkhdr;
			n->m_len = hlen + tlen;
			m_copyback(n, 0, hlen, mtod(m, void *));
			m_copyback(n, hlen, tlen, (void *)th0);

			m_freem(m);
			m = n;
			n = NULL;
		}

#define xchg(a,b,type) do { type t; t=a; a=b; b=t; } while (0)
		/* Turn the packet around: swap addresses and ports. */
		switch (family) {
		case AF_INET:
			ip = mtod(m, struct ip *);
			th = (struct tcphdr *)(ip + 1);
			ip->ip_p = IPPROTO_TCP;
			xchg(ip->ip_dst, ip->ip_src, struct in_addr);
			break;
#ifdef INET6
		case AF_INET6:
			ip6 = mtod(m, struct ip6_hdr *);
			th = (struct tcphdr *)(ip6 + 1);
			ip6->ip6_nxt = IPPROTO_TCP;
			xchg(ip6->ip6_dst, ip6->ip6_src, struct in6_addr);
			break;
#endif
		}
		xchg(th->th_dport, th->th_sport, u_int16_t);
#undef xchg
		tlen = 0;	/*be friendly with the following code*/
	}
	th->th_seq = htonl(seq);
	th->th_ack = htonl(ack);
	th->th_x2 = 0;
	if ((flags & TH_SYN) == 0) {
		/* No options are sent: fixed-size header, scaled window. */
		if (tp)
			win >>= tp->rcv_scale;
		if (win > TCP_MAXWIN)
			win = TCP_MAXWIN;
		th->th_win = htons((u_int16_t)win);
		th->th_off = sizeof (struct tcphdr) >> 2;
		tlen += sizeof(*th);
	} else {
		/* Header length (with options) as already present in th. */
		tlen += th->th_off << 2;
	}
	m->m_len = hlen + tlen;
	m->m_pkthdr.len = hlen + tlen;
	m_reset_rcvif(m);
	th->th_flags = flags;
	th->th_urp = 0;

	/* Fill in lengths, hop limit/TTL and the TCP checksum. */
	switch (family) {
	case AF_INET:
	{
		struct ipovly *ipov = (struct ipovly *)ip;

		/* Checksum is computed over the overlay (pseudo) header. */
		memset(ipov->ih_x1, 0, sizeof ipov->ih_x1);
		ipov->ih_len = htons((u_int16_t)tlen);

		th->th_sum = 0;
		th->th_sum = in_cksum(m, hlen + tlen);
		ip->ip_len = htons(hlen + tlen);
		ip->ip_ttl = ip_defttl;
		break;
	}
#ifdef INET6
	case AF_INET6:
	{
		th->th_sum = 0;
		th->th_sum = in6_cksum(m, IPPROTO_TCP, sizeof(struct ip6_hdr),
		    tlen);
		ip6->ip6_plen = htons(tlen);
		if (tp && tp->t_inpcb->inp_af == AF_INET6)
			ip6->ip6_hlim = in6pcb_selecthlim_rt(tp->t_inpcb);
		else
			ip6->ip6_hlim = ip6_defhlim;
		ip6->ip6_flow &= ~IPV6_FLOWINFO_MASK;
		if (ip6_auto_flowlabel) {
			ip6->ip6_flow |=
			    (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK);
		}
		break;
	}
#endif
	}

	/* Use the pcb's cached route when there is a pcb. */
	if (tp != NULL && tp->t_inpcb->inp_af == AF_INET) {
		ro = &tp->t_inpcb->inp_route;
		KASSERT(family == AF_INET);
		KASSERT(in_hosteq(ip->ip_dst, in4p_faddr(tp->t_inpcb)));
	}
#ifdef INET6
	else if (tp != NULL && tp->t_inpcb->inp_af == AF_INET6) {
		ro = (struct route *)&tp->t_inpcb->inp_route;

#ifdef DIAGNOSTIC
		if (family == AF_INET) {
			if (!IN6_IS_ADDR_V4MAPPED(&in6p_faddr(tp->t_inpcb)))
				panic("tcp_respond: not mapped addr");
			if (memcmp(&ip->ip_dst,
			    &in6p_faddr(tp->t_inpcb).s6_addr32[3],
			    sizeof(ip->ip_dst)) != 0) {
				panic("tcp_respond: ip_dst != in6p_faddr");
			}
		} else if (family == AF_INET6) {
			if (!IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst,
			    &in6p_faddr(tp->t_inpcb)))
				panic("tcp_respond: ip6_dst != in6p_faddr");
		} else
			panic("tcp_respond: address family mismatch");
#endif
	}
#endif
	else
		ro = NULL;

	switch (family) {
	case AF_INET:
		error = ip_output(m, NULL, ro,
		    (tp && tp->t_mtudisc ? IP_MTUDISC : 0), NULL,
		    tp ? tp->t_inpcb : NULL);
		break;
#ifdef INET6
	case AF_INET6:
		error = ip6_output(m, NULL, ro, 0, NULL,
		    tp ? tp->t_inpcb : NULL, NULL);
		break;
#endif
	default:
		error = EAFNOSUPPORT;
		break;
	}

	return error;
}
/*
 * Template TCPCB.  Rather than zeroing a new TCPCB and initializing
 * a bunch of members individually, we maintain this template for the
 * static and mostly-static components of the TCPCB, and copy it into
 * the new TCPCB instead.
 *
 * Members that follow tunables are filled in by tcp_tcpcb_template();
 * tcp_newtcpcb() copies the whole structure with memcpy().
 */
static struct tcpcb tcpcb_template = {
	.t_srtt = TCPTV_SRTTBASE,
	.t_rttmin = TCPTV_MIN,

	.snd_cwnd = TCP_MAXWIN << TCP_MAX_WINSHIFT,
	.snd_ssthresh = TCP_MAXWIN << TCP_MAX_WINSHIFT,
	.snd_numholes = 0,
	.snd_cubic_wmax = 0,
	.snd_cubic_wmax_last = 0,
	.snd_cubic_ctime = 0,

	.t_partialacks = -1,
	.t_bytes_acked = 0,
	.t_sndrexmitpack = 0,
	.t_rcvoopack = 0,
	.t_sndzerowin = 0,
};
/*
* Updates the TCPCB template whenever a parameter that would affect
* the template is changed.
*/
void
tcp_tcpcb_template(void)
{
struct tcpcb *tp = &tcpcb_template;
int flags;
tp->t_peermss = tcp_mssdflt;
tp->t_ourmss = tcp_mssdflt;
tp->t_segsz = tcp_mssdflt;
flags = 0;
if (tcp_do_rfc1323 && tcp_do_win_scale)
flags |= TF_REQ_SCALE;
if (tcp_do_rfc1323 && tcp_do_timestamps)
flags |= TF_REQ_TSTMP;
tp->t_flags = flags;
1993-03-21 12:45:37 +03:00
/*
* Init srtt to TCPTV_SRTTBASE (0), so we can tell that we have no
* rtt estimate. Set rttvar so that srtt + 2 * rttvar gives
* reasonable initial retransmit time.
*/
tp->t_rttvar = tcp_rttdflt * PR_SLOWHZ << (TCP_RTTVAR_SHIFT + 2 - 1);
TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp),
1993-03-21 12:45:37 +03:00
TCPTV_MIN, TCPTV_REXMTMAX);
/* Keep Alive */
tp->t_keepinit = MIN(tcp_keepinit, TCP_TIMER_MAXTICKS);
tp->t_keepidle = MIN(tcp_keepidle, TCP_TIMER_MAXTICKS);
tp->t_keepintvl = MIN(tcp_keepintvl, TCP_TIMER_MAXTICKS);
tp->t_keepcnt = MAX(1, MIN(tcp_keepcnt, TCP_TIMER_MAXTICKS));
tp->t_maxidle = tp->t_keepcnt * MIN(tp->t_keepintvl,
TCP_TIMER_MAXTICKS/tp->t_keepcnt);
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
/* MSL */
tp->t_msl = TCPTV_MSL;
}
/*
 * Create a new TCP control block, making an
 * empty reassembly queue and hooking it to the argument
 * protocol control block.
 *
 * `family' must be AF_INET or (with INET6) AF_INET6.  Returns the new
 * tcpcb, or NULL if the pool allocation fails or the family is not
 * supported; in the latter case the half-built tcpcb is torn down again
 * and `inp' is left untouched.
 */
struct tcpcb *
tcp_newtcpcb(int family, struct inpcb *inp)
{
	struct tcpcb *tp;
	int i;

	/* XXX Consider using a pool_cache for speed. */
	tp = pool_get(&tcpcb_pool, PR_NOWAIT);	/* splsoftnet via tcp_usrreq */
	if (tp == NULL)
		return NULL;

	/* Start from the template, then set up the per-connection parts. */
	memcpy(tp, &tcpcb_template, sizeof(*tp));
	TAILQ_INIT(&tp->segq);
	TAILQ_INIT(&tp->timeq);
	tp->t_family = family;		/* may be overridden later on */
	TAILQ_INIT(&tp->snd_holes);
	LIST_INIT(&tp->t_sc);		/* XXX can template this */

	/* Don't sweat this loop; hopefully the compiler will unroll it. */
	for (i = 0; i < TCPT_NTIMERS; i++) {
		callout_init(&tp->t_timer[i], CALLOUT_MPSAFE);
		TCP_TIMER_INIT(tp, i);
	}
	callout_init(&tp->t_delack_ch, CALLOUT_MPSAFE);

	/* Link tcpcb and inpcb together and pick family defaults. */
	switch (family) {
	case AF_INET:
		in4p_ip(inp).ip_ttl = ip_defttl;
		inp->inp_ppcb = (void *)tp;

		tp->t_inpcb = inp;
		tp->t_mtudisc = ip_mtudisc;
		break;
#ifdef INET6
	case AF_INET6:
		in6p_ip6(inp).ip6_hlim = in6pcb_selecthlim_rt(inp);
		inp->inp_ppcb = (void *)tp;

		tp->t_inpcb = inp;
		/* for IPv6, always try to run path MTU discovery */
		tp->t_mtudisc = 1;
		break;
#endif /* INET6 */
	default:
		/* Unsupported family: undo the callout setup and give up. */
		for (i = 0; i < TCPT_NTIMERS; i++)
			callout_destroy(&tp->t_timer[i]);
		callout_destroy(&tp->t_delack_ch);
		pool_put(&tcpcb_pool, tp);	/* splsoftnet via tcp_usrreq */
		return NULL;
	}

	/*
	 * Initialize our timebase.  When we send timestamps, we take
	 * the delta from tcp_now -- this means each connection always
	 * gets a timebase of 1, which makes it, among other things,
	 * more difficult to determine how long a system has been up,
	 * and thus how many TCP sequence increments have occurred.
	 *
	 * We start with 1, because 0 doesn't work with linux, which
	 * considers timestamp 0 in a SYN packet as a bug and disables
	 * timestamps.
	 */
	tp->ts_timebase = tcp_now - 1;

	tcp_congctl_select(tp, tcp_congctl_global_name);

	return tp;
}
/*
* Drop a TCP connection, reporting
* the specified error. If connection is synchronized,
* then send a RST to peer.
*/
struct tcpcb *
2005-02-04 02:50:33 +03:00
tcp_drop(struct tcpcb *tp, int errno)
1993-03-21 12:45:37 +03:00
{
struct socket *so;
KASSERT(tp->t_inpcb != NULL);
so = tp->t_inpcb->inp_socket;
if (so == NULL)
2000-10-29 09:33:59 +03:00
return NULL;
1993-03-21 12:45:37 +03:00
if (TCPS_HAVERCVDSYN(tp->t_state)) {
tp->t_state = TCPS_CLOSED;
(void) tcp_output(tp);
TCP_STATINC(TCP_STAT_DROPS);
1993-03-21 12:45:37 +03:00
} else
TCP_STATINC(TCP_STAT_CONNDROPS);
1993-03-21 12:45:37 +03:00
if (errno == ETIMEDOUT && tp->t_softerror)
errno = tp->t_softerror;
so->so_error = errno;
return (tcp_close(tp));
}
/*
* Close a TCP control block:
* discard all space held by the tcp
* discard internet protocol block
* wake up any sleepers
*/
struct tcpcb *
2005-02-04 02:50:33 +03:00
tcp_close(struct tcpcb *tp)
1993-03-21 12:45:37 +03:00
{
struct inpcb *inp;
struct socket *so;
1993-03-21 12:45:37 +03:00
#ifdef RTV_RTT
struct rtentry *rt = NULL;
#endif
struct route *ro;
int j;
1993-03-21 12:45:37 +03:00
inp = tp->t_inpcb;
so = inp->inp_socket;
ro = &inp->inp_route;
#ifdef RTV_RTT
1993-03-21 12:45:37 +03:00
/*
* If we sent enough data to get some meaningful characteristics,
2002-06-09 20:33:36 +04:00
* save them in the routing entry. 'Enough' is arbitrarily
1993-03-21 12:45:37 +03:00
* defined as the sendpipesize (default 4K) * 16. This would
* give us 16 rtt samples assuming we only get one sample per
* window (the usual case on a long haul net). 16 samples is
* enough for the srtt filter to converge to within 5% of the correct
* value; fewer samples and we could save a very bogus rtt.
*
* Don't update the default route's characteristics and don't
* update anything that the user "locked".
*/
if (SEQ_LT(tp->iss + so->so_snd.sb_hiwat * 16, tp->snd_max) &&
ro && (rt = rtcache_validate(ro)) != NULL &&
Take steps to hide the radix_node implementation of the forwarding table from the forwarding table's users: Introduce rt_walktree() for walking the routing table and applying a function to each rtentry. Replace most rn_walktree() calls with it. Use rt_getkey()/rt_setkey() to get/set a route's destination. Keep a pointer to the sockaddr key in the rtentry, so that rtentry users do not have to grovel in the radix_node for the key. Add a RTM_GET method to rtrequest. Use that instead of radix_node lookups in, e.g., carp(4). Add sys/net/link_proto.c, which supplies sockaddr routines for link-layer socket addresses (sockaddr_dl). Cosmetic: Constify. KNF. Stop open-coding LIST_FOREACH, TAILQ_FOREACH, et cetera. Use NULL instead of 0 for null pointers. Use __arraycount(). Reduce gratuitous parenthesization. Stop using variadic arguments for rip6_output(), it is unnecessary. Remove the unnecessary rtentry member rt_genmask and the code to maintain it, since nothing actually used it. Make rt_maskedcopy() easier to read by using meaningful variable names. Extract a subroutine intern_netmask() for looking up a netmask in the masks table. Start converting backslash-ridden IPv6 macros in sys/netinet6/in6_var.h into inline subroutines that one can read without special eyeglasses. One functional change: when the kernel serves an RTM_GET, RTM_LOCK, or RTM_CHANGE request, it applies the netmask (if supplied) to a destination before searching for it in the forwarding table. I have changed sys/netinet/ip_carp.c, carp_setroute(), to remove the unlawful radix_node knowledge. Apart from the changes to carp(4), netiso, ATM, and strip(4), I have run the changes on three nodes in my wireless routing testbed, which involves IPv4 + IPv6 dynamic routing acrobatics, and it's working beautifully so far.
2007-07-20 00:48:52 +04:00
!in_nullhost(satocsin(rt_getkey(rt))->sin_addr)) {
2000-03-30 16:51:13 +04:00
u_long i = 0;
1993-03-21 12:45:37 +03:00
if ((rt->rt_rmx.rmx_locks & RTV_RTT) == 0) {
i = tp->t_srtt *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));
1993-03-21 12:45:37 +03:00
if (rt->rt_rmx.rmx_rtt && i)
/*
* filter this update to half the old & half
* the new values, converting scale.
* See route.h and tcp_var.h for a
* description of the scaling constants.
*/
rt->rt_rmx.rmx_rtt =
(rt->rt_rmx.rmx_rtt + i) / 2;
else
rt->rt_rmx.rmx_rtt = i;
}
if ((rt->rt_rmx.rmx_locks & RTV_RTTVAR) == 0) {
i = tp->t_rttvar *
((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
1993-03-21 12:45:37 +03:00
if (rt->rt_rmx.rmx_rttvar && i)
rt->rt_rmx.rmx_rttvar =
(rt->rt_rmx.rmx_rttvar + i) / 2;
else
rt->rt_rmx.rmx_rttvar = i;
}
/*
* update the pipelimit (ssthresh) if it has been updated
2021-07-31 23:29:36 +03:00
* already or if a pipesize was specified & the threshold
1993-03-21 12:45:37 +03:00
* got below half the pipesize. I.e., wait for bad news
* before we start updating, then update on both good
* and bad news.
*/
1996-02-14 02:40:59 +03:00
if (((rt->rt_rmx.rmx_locks & RTV_SSTHRESH) == 0 &&
(i = tp->snd_ssthresh) && rt->rt_rmx.rmx_ssthresh) ||
1993-03-21 12:45:37 +03:00
i < (rt->rt_rmx.rmx_sendpipe / 2)) {
/*
* convert the limit from user data bytes to
* packets then to packet data bytes.
*/
i = (i + tp->t_segsz / 2) / tp->t_segsz;
1993-03-21 12:45:37 +03:00
if (i < 2)
i = 2;
i *= (u_long)(tp->t_segsz + sizeof (struct tcpiphdr));
1993-03-21 12:45:37 +03:00
if (rt->rt_rmx.rmx_ssthresh)
rt->rt_rmx.rmx_ssthresh =
(rt->rt_rmx.rmx_ssthresh + i) / 2;
else
rt->rt_rmx.rmx_ssthresh = i;
}
}
rtcache_unref(rt, ro);
#endif /* RTV_RTT */
1993-03-21 12:45:37 +03:00
/* free the reassembly queue, if any */
TCP_REASS_LOCK(tp);
1997-12-10 04:58:07 +03:00
(void) tcp_freeq(tp);
TCP_REASS_UNLOCK(tp);
/* free the SACK holes list. */
tcp_free_sackholes(tp);
tcp_congctl_release(tp);
syn_cache_cleanup(tp);
1997-12-10 04:58:07 +03:00
if (tp->t_template) {
m_free(tp->t_template);
tp->t_template = NULL;
}
/*
* Detaching the pcb will unlock the socket/tcpcb, and stopping
* the timers can also drop the lock. We need to prevent access
* to the tcpcb as it's half torn down. Flag the pcb as dead
* (prevents access by timers) and only then detach it.
*/
tp->t_flags |= TF_DEAD;
inp->inp_ppcb = NULL;
soisdisconnected(so);
inpcb_destroy(inp);
/*
* pcb is no longer visble elsewhere, so we can safely release
* the lock in callout_halt() if needed.
*/
TCP_STATINC(TCP_STAT_CLOSED);
for (j = 0; j < TCPT_NTIMERS; j++) {
callout_halt(&tp->t_timer[j], softnet_lock);
callout_destroy(&tp->t_timer[j]);
}
callout_halt(&tp->t_delack_ch, softnet_lock);
callout_destroy(&tp->t_delack_ch);
pool_put(&tcpcb_pool, tp);
return NULL;
1993-03-21 12:45:37 +03:00
}
1997-12-10 04:58:07 +03:00
int
tcp_freeq(struct tcpcb *tp)
1997-12-10 04:58:07 +03:00
{
2000-03-30 16:51:13 +04:00
struct ipqent *qe;
1997-12-10 04:58:07 +03:00
int rv = 0;
TCP_REASS_LOCK_CHECK(tp);
while ((qe = TAILQ_FIRST(&tp->segq)) != NULL) {
TAILQ_REMOVE(&tp->segq, qe, ipqe_q);
TAILQ_REMOVE(&tp->timeq, qe, ipqe_timeq);
1997-12-10 04:58:07 +03:00
m_freem(qe->ipqe_m);
2005-03-30 00:10:16 +04:00
tcpipqent_free(qe);
1997-12-10 04:58:07 +03:00
rv = 1;
}
tp->t_segqlen = 0;
KASSERT(TAILQ_EMPTY(&tp->timeq));
1997-12-10 04:58:07 +03:00
return (rv);
}
/*
 * Fast timeout hook: perform a drain that was requested earlier through
 * tcp_drainstub(), then clear the request.
 */
void
tcp_fasttimo(void)
{
	/* Nothing pending: the common case. */
	if (!tcp_drainwanted)
		return;

	tcp_drain();
	tcp_drainwanted = 0;
}
/*
 * Request a drain.  Only a flag is set here; the actual work is done by
 * tcp_drain() the next time tcp_fasttimo() runs.
 */
void
tcp_drainstub(void)
{

	tcp_drainwanted = 1;
}
1997-12-10 04:58:07 +03:00
/*
* Protocol drain routine. Called when memory is in short supply.
* Called from pr_fasttimo thus a callout context.
1997-12-10 04:58:07 +03:00
*/
1994-01-09 02:07:16 +03:00
void
2005-02-04 02:50:33 +03:00
tcp_drain(void)
1993-03-21 12:45:37 +03:00
{
struct inpcb *inp;
2000-03-30 16:51:13 +04:00
struct tcpcb *tp;
1993-03-21 12:45:37 +03:00
mutex_enter(softnet_lock);
KERNEL_LOCK(1, NULL);
1997-12-10 04:58:07 +03:00
/*
* Free the sequence queue of all TCP connections.
*/
TAILQ_FOREACH(inp, &tcbtable.inpt_queue, inp_queue) {
tp = intotcpcb(inp);
if (tp != NULL) {
2002-03-15 12:25:41 +03:00
/*
* If the tcpcb is already busy,
2002-03-15 12:25:41 +03:00
* just bail out now.
*/
if (tcp_reass_lock_try(tp) == 0)
continue;
if (tcp_freeq(tp))
TCP_STATINC(TCP_STAT_CONNSDRAINED);
2002-03-15 12:25:41 +03:00
TCP_REASS_UNLOCK(tp);
}
}
KERNEL_UNLOCK_ONE(NULL);
mutex_exit(softnet_lock);
2002-03-15 12:25:41 +03:00
}
1993-03-21 12:45:37 +03:00
/*
* Notify a tcp user of an asynchronous error;
* store error as soft error, but wake up user
* (for now, won't do anything until can select for soft error).
*/
1994-01-09 02:07:16 +03:00
void
2005-02-04 02:50:33 +03:00
tcp_notify(struct inpcb *inp, int error)
1993-03-21 12:45:37 +03:00
{
2000-03-30 16:51:13 +04:00
struct tcpcb *tp = (struct tcpcb *)inp->inp_ppcb;
struct socket *so = inp->inp_socket;
1993-03-21 12:45:37 +03:00
/*
* Ignore some errors if we are hooked up.
* If connection hasn't completed, has retransmitted several times,
* and receives a second error, give up now. This is better
* than waiting a long time to establish a connection that
* can never complete.
*/
if (tp->t_state == TCPS_ESTABLISHED &&
(error == EHOSTUNREACH || error == ENETUNREACH ||
error == EHOSTDOWN)) {
return;
} else if (TCPS_HAVEESTABLISHED(tp->t_state) == 0 &&
tp->t_rxtshift > 3 && tp->t_softerror)
so->so_error = error;
2002-06-09 20:33:36 +04:00
else
tp->t_softerror = error;
cv_broadcast(&so->so_cv);
sorwakeup(so);
sowwakeup(so);
1993-03-21 12:45:37 +03:00
}
#ifdef INET6
void *
tcp6_ctlinput(int cmd, const struct sockaddr *sa, void *d)
{
struct tcphdr th;
void (*notify)(struct inpcb *, int) = tcp_notify;
int nmatch;
2000-03-30 16:51:13 +04:00
struct ip6_hdr *ip6;
const struct sockaddr_in6 *sa6_src = NULL;
const struct sockaddr_in6 *sa6 = (const struct sockaddr_in6 *)sa;
struct mbuf *m;
int off;
if (sa->sa_family != AF_INET6 ||
sa->sa_len != sizeof(struct sockaddr_in6))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
return NULL;
else if (cmd == PRC_QUENCH) {
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
} else if (PRC_IS_REDIRECT(cmd))
2022-11-04 12:01:53 +03:00
notify = in6pcb_rtchange, d = NULL;
else if (cmd == PRC_MSGSIZE)
; /* special code is present, see below */
else if (cmd == PRC_HOSTDEAD)
d = NULL;
else if (inet6ctlerrmap[cmd] == 0)
return NULL;
/* if the parameter is from icmp6, decode it. */
if (d != NULL) {
struct ip6ctlparam *ip6cp = (struct ip6ctlparam *)d;
m = ip6cp->ip6c_m;
ip6 = ip6cp->ip6c_ip6;
off = ip6cp->ip6c_off;
sa6_src = ip6cp->ip6c_src;
} else {
m = NULL;
ip6 = NULL;
sa6_src = &sa6_any;
2003-10-25 12:13:28 +04:00
off = 0;
}
if (ip6) {
/* check if we can safely examine src and dst ports */
if (m->m_pkthdr.len < off + sizeof(th)) {
if (cmd == PRC_MSGSIZE)
icmp6_mtudisc_update((struct ip6ctlparam *)d, 0);
return NULL;
}
2009-03-18 19:00:08 +03:00
memset(&th, 0, sizeof(th));
m_copydata(m, off, sizeof(th), (void *)&th);
if (cmd == PRC_MSGSIZE) {
int valid = 0;
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMPv6 message
* payload.
*/
2022-11-04 12:01:53 +03:00
if (in6pcb_lookup(&tcbtable, &sa6->sin6_addr,
th.th_dport,
(const struct in6_addr *)&sa6_src->sin6_addr,
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
th.th_sport, 0, 0))
valid++;
/*
* Depending on the value of "valid" and routing table
* size (mtudisc_{hi,lo}wat), we will:
* - recalcurate the new MTU and create the
* corresponding routing entry, or
* - ignore the MTU change notification.
*/
icmp6_mtudisc_update((struct ip6ctlparam *)d, valid);
/*
2022-11-04 12:01:53 +03:00
* no need to call in6pcb_notify, it should have been
* called via callback if necessary
*/
return NULL;
}
2022-11-04 12:01:53 +03:00
nmatch = in6pcb_notify(&tcbtable, sa, th.th_dport,
(const struct sockaddr *)sa6_src, th.th_sport, cmd, NULL, notify);
if (nmatch == 0 && syn_cache_count &&
(inet6ctlerrmap[cmd] == EHOSTUNREACH ||
inet6ctlerrmap[cmd] == ENETUNREACH ||
inet6ctlerrmap[cmd] == EHOSTDOWN))
syn_cache_unreach((const struct sockaddr *)sa6_src,
sa, &th);
} else {
2022-11-04 12:01:53 +03:00
(void) in6pcb_notify(&tcbtable, sa, 0,
(const struct sockaddr *)sa6_src, 0, cmd, NULL, notify);
}
return NULL;
}
#endif
/* assumes that ip header and tcp header are contiguous on mbuf */
1996-02-14 02:40:59 +03:00
void *
tcp_ctlinput(int cmd, const struct sockaddr *sa, void *v)
1993-03-21 12:45:37 +03:00
{
2000-03-30 16:51:13 +04:00
struct ip *ip = v;
struct tcphdr *th;
struct icmp *icp;
extern const int inetctlerrmap[];
2005-02-03 00:41:55 +03:00
void (*notify)(struct inpcb *, int) = tcp_notify;
1995-06-12 10:48:54 +04:00
int errno;
int nmatch;
struct tcpcb *tp;
u_int mtu;
tcp_seq seq;
struct inpcb *inp;
#ifdef INET6
struct in6_addr src6, dst6;
#endif
1993-03-21 12:45:37 +03:00
if (sa->sa_family != AF_INET ||
sa->sa_len != sizeof(struct sockaddr_in))
return NULL;
if ((unsigned)cmd >= PRC_NCMDS)
1996-02-14 02:40:59 +03:00
return NULL;
errno = inetctlerrmap[cmd];
1995-06-12 10:24:21 +04:00
if (cmd == PRC_QUENCH)
/*
* Don't honor ICMP Source Quench messages meant for
* TCP connections.
*/
return NULL;
1995-06-12 10:24:21 +04:00
else if (PRC_IS_REDIRECT(cmd))
notify = inpcb_rtchange, ip = 0;
else if (cmd == PRC_MSGSIZE && ip && ip->ip_v == 4) {
/*
* Check to see if we have a valid TCP connection
* corresponding to the address in the ICMP message
* payload.
*
* Boundary check is made in icmp_input(), with ICMP_ADVLENMIN.
*/
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
#ifdef INET6
in6_in_2_v4mapin6(&ip->ip_src, &src6);
in6_in_2_v4mapin6(&ip->ip_dst, &dst6);
#endif
if ((inp = inpcb_lookup(&tcbtable, ip->ip_dst,
th->th_dport, ip->ip_src, th->th_sport, 0)) != NULL)
2005-07-20 12:05:43 +04:00
;
#ifdef INET6
2022-11-04 12:01:53 +03:00
else if ((inp = in6pcb_lookup(&tcbtable, &dst6,
th->th_dport, &src6, th->th_sport, 0, 0)) != NULL)
;
#endif
else
return NULL;
/*
* Now that we've validated that we are actually communicating
* with the host indicated in the ICMP message, locate the
* ICMP header, recalculate the new MTU, and create the
* corresponding routing entry.
*/
icp = (struct icmp *)((char *)ip -
offsetof(struct icmp, icmp_ip));
tp = intotcpcb(inp);
if (tp == NULL)
return NULL;
seq = ntohl(th->th_seq);
if (SEQ_LT(seq, tp->snd_una) || SEQ_GT(seq, tp->snd_max))
return NULL;
/*
* If the ICMP message advertises a Next-Hop MTU
* equal or larger than the maximum packet size we have
* ever sent, drop the message.
*/
mtu = (u_int)ntohs(icp->icmp_nextmtu);
if (mtu >= tp->t_pmtud_mtu_sent)
return NULL;
if (mtu >= tcp_hdrsz(tp) + tp->t_pmtud_mss_acked) {
/*
* Calculate new MTU, and create corresponding
* route (traditional PMTUD).
*/
tp->t_flags &= ~TF_PMTUD_PEND;
icmp_mtudisc(icp, ip->ip_dst);
} else {
/*
* Record the information got in the ICMP
* message; act on it later.
* If we had already recorded an ICMP message,
* replace the old one only if the new message
* refers to an older TCP segment
*/
if (tp->t_flags & TF_PMTUD_PEND) {
if (SEQ_LT(tp->t_pmtud_th_seq, seq))
return NULL;
} else
tp->t_flags |= TF_PMTUD_PEND;
tp->t_pmtud_th_seq = seq;
tp->t_pmtud_nextmtu = icp->icmp_nextmtu;
tp->t_pmtud_ip_len = icp->icmp_ip.ip_len;
tp->t_pmtud_ip_hl = icp->icmp_ip.ip_hl;
}
return NULL;
} else if (cmd == PRC_HOSTDEAD)
1995-06-12 10:24:21 +04:00
ip = 0;
else if (errno == 0)
1996-02-14 02:40:59 +03:00
return NULL;
if (ip && ip->ip_v == 4 && sa->sa_family == AF_INET) {
th = (struct tcphdr *)((char *)ip + (ip->ip_hl << 2));
nmatch = inpcb_notify(&tcbtable, satocsin(sa)->sin_addr,
th->th_dport, ip->ip_src, th->th_sport, errno, notify);
if (nmatch == 0 && syn_cache_count &&
(inetctlerrmap[cmd] == EHOSTUNREACH ||
inetctlerrmap[cmd] == ENETUNREACH ||
inetctlerrmap[cmd] == EHOSTDOWN)) {
struct sockaddr_in sin;
2009-03-18 19:00:08 +03:00
memset(&sin, 0, sizeof(sin));
sin.sin_len = sizeof(sin);
sin.sin_family = AF_INET;
sin.sin_port = th->th_sport;
sin.sin_addr = ip->ip_src;
syn_cache_unreach((struct sockaddr *)&sin, sa, th);
}
/* XXX mapped address case */
} else
inpcb_notifyall(&tcbtable, satocsin(sa)->sin_addr, errno,
notify);
1996-02-14 02:40:59 +03:00
return NULL;
1993-03-21 12:45:37 +03:00
}
/*
2005-03-09 07:24:12 +03:00
* When a source quench is received, we are being notified of congestion.
* Close the congestion window down to the Loss Window (one segment).
* We will gradually open it again as we proceed.
1993-03-21 12:45:37 +03:00
*/
1994-01-09 02:07:16 +03:00
void
2018-12-27 19:59:17 +03:00
tcp_quench(struct inpcb *inp)
1993-03-21 12:45:37 +03:00
{
struct tcpcb *tp = intotcpcb(inp);
if (tp) {
tp->snd_cwnd = tp->t_segsz;
tp->t_bytes_acked = 0;
}
1993-03-21 12:45:37 +03:00
}
/*
* Path MTU Discovery handlers.
*/
void
2005-02-04 02:50:33 +03:00
tcp_mtudisc_callback(struct in_addr faddr)
{
#ifdef INET6
struct in6_addr in6;
#endif
inpcb_notifyall(&tcbtable, faddr, EMSGSIZE, tcp_mtudisc);
#ifdef INET6
in6_in_2_v4mapin6(&faddr, &in6);
tcp6_mtudisc_callback(&in6);
#endif
}
/*
* On receipt of path MTU corrections, flush old route and replace it
* with the new one. Retransmit all unacknowledged packets, to ensure
* that all packets will be received.
*/
void
2005-02-04 02:50:33 +03:00
tcp_mtudisc(struct inpcb *inp, int errno)
{
struct tcpcb *tp = intotcpcb(inp);
struct rtentry *rt;
if (tp == NULL)
return;
rt = inpcb_rtentry(inp);
if (rt != NULL) {
/*
* If this was not a host route, remove and realloc.
*/
if ((rt->rt_flags & RTF_HOST) == 0) {
inpcb_rtentry_unref(rt, inp);
inpcb_rtchange(inp, errno);
if ((rt = inpcb_rtentry(inp)) == NULL)
return;
}
2002-06-09 20:33:36 +04:00
/*
* Slow start out of the error condition. We
* use the MTU because we know it's smaller
* than the previously transmitted segment.
*
* Note: This is more conservative than the
* suggestion in draft-floyd-incr-init-win-03.
*/
if (rt->rt_rmx.rmx_mtu != 0)
tp->snd_cwnd =
TCP_INITIAL_WINDOW(tcp_init_win,
rt->rt_rmx.rmx_mtu);
inpcb_rtentry_unref(rt, inp);
}
/*
* Resend unacknowledged packets.
*/
tp->snd_nxt = tp->sack_newdata = tp->snd_una;
tcp_output(tp);
}
#ifdef INET6
/*
* Path MTU Discovery handlers.
*/
void
2005-02-04 02:50:33 +03:00
tcp6_mtudisc_callback(struct in6_addr *faddr)
{
struct sockaddr_in6 sin6;
2009-03-18 19:00:08 +03:00
memset(&sin6, 0, sizeof(sin6));
sin6.sin6_family = AF_INET6;
sin6.sin6_len = sizeof(struct sockaddr_in6);
sin6.sin6_addr = *faddr;
2022-11-04 12:01:53 +03:00
(void) in6pcb_notify(&tcbtable, (struct sockaddr *)&sin6, 0,
(const struct sockaddr *)&sa6_any, 0, PRC_MSGSIZE, NULL, tcp6_mtudisc);
}
/*
 * Path MTU correction for one IPv6 TCP PCB.
 *
 * Mirrors tcp_mtudisc() using the in6pcb route helpers: replace a
 * non-host route, restart slow start from the route's MTU, and resend
 * all unacknowledged data.
 *
 * inp	 - the PCB being notified; ignored if it has no tcpcb.
 * errno - error code forwarded to in6pcb_rtchange().
 *
 * If the route had to be replaced and no new route can be obtained, the
 * function returns without retransmitting.
 */
void
tcp6_mtudisc(struct inpcb *inp, int errno)
{
	struct tcpcb *tcb;
	struct rtentry *route;

	tcb = intotcpcb(inp);
	if (tcb == NULL)
		return;

	route = in6pcb_rtentry(inp);

	/*
	 * If this was not a host route, remove and realloc.
	 */
	if (route != NULL && (route->rt_flags & RTF_HOST) == 0) {
		in6pcb_rtentry_unref(route, inp);
		in6pcb_rtchange(inp, errno);
		route = in6pcb_rtentry(inp);
		if (route == NULL)
			return;
	}

	if (route != NULL) {
		/*
		 * Slow start out of the error condition.  We
		 * use the MTU because we know it's smaller
		 * than the previously transmitted segment.
		 *
		 * Note: This is more conservative than the
		 * suggestion in draft-floyd-incr-init-win-03.
		 */
		if (route->rt_rmx.rmx_mtu != 0) {
			tcb->snd_cwnd = TCP_INITIAL_WINDOW(tcp_init_win,
			    route->rt_rmx.rmx_mtu);
		}
		in6pcb_rtentry_unref(route, inp);
	}

	/*
	 * Resend unacknowledged packets.
	 */
	tcb->sack_newdata = tcb->snd_una;
	tcb->snd_nxt = tcb->sack_newdata;
	tcp_output(tcb);
}
#endif /* INET6 */
/*
* Compute the MSS to advertise to the peer. Called only during
* the 3-way handshake. If we are the server (peer initiated
* connection), we are called with a pointer to the interface
2002-06-09 20:33:36 +04:00
* on which the SYN packet arrived. If we are the client (we
* initiated connection), we are called with a pointer to the
* interface out which this connection should go.
*
* NOTE: Do not subtract IP option/extension header size nor IPsec
* header size from MSS advertisement. MSS option must hold the maximum
* segment size we can accept, so it must always be:
* max(if mtu) - ip header - tcp header
*/
u_long
2005-02-04 02:50:33 +03:00
tcp_mss_to_advertise(const struct ifnet *ifp, int af)
{
extern u_long in_maxmtu;
u_long mss = 0;
u_long hdrsiz;
/*
* In order to avoid defeating path MTU discovery on the peer,
* we advertise the max MTU of all attached networks as our MSS,
* per RFC 1191, section 3.1.
*
* We provide the option to advertise just the MTU of
* the interface on which we hope this connection will
* be receiving. If we are responding to a SYN, we
* will have a pretty good idea about this, but when
* initiating a connection there is a bit more doubt.
*
* We also need to ensure that loopback has a large enough
* MSS, as the loopback MTU is never included in in_maxmtu.
*/
if (ifp != NULL)
switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
mss = ifp->if_mtu;
break;
}
if (tcp_mss_ifmtu == 0)
switch (af) {
#ifdef INET6
case AF_INET6: /* FALLTHROUGH */
#endif
case AF_INET:
Rename min/max -> uimin/uimax for better honesty. These functions are defined on unsigned int. The generic name min/max should not silently truncate to 32 bits on 64-bit systems. This is purely a name change -- no functional change intended. HOWEVER! Some subsystems have #define min(a, b) ((a) < (b) ? (a) : (b)) #define max(a, b) ((a) > (b) ? (a) : (b)) even though our standard name for that is MIN/MAX. Although these may invite multiple evaluation bugs, these do _not_ cause integer truncation. To avoid `fixing' these cases, I first changed the name in libkern, and then compile-tested every file where min/max occurred in order to confirm that it failed -- and thus confirm that nothing shadowed min/max -- before changing it. I have left a handful of bootloaders that are too annoying to compile-test, and some dead code: cobalt ews4800mips hp300 hppa ia64 luna68k vax acorn32/if_ie.c (not included in any kernels) macppc/if_gm.c (superseded by gem(4)) It should be easy to fix the fallout once identified -- this way of doing things fails safe, and the goal here, after all, is to _avoid_ silent integer truncations, not introduce them. Maybe one day we can reintroduce min/max as type-generic things that never silently truncate. But we should avoid doing that for a while, so that existing code has a chance to be detected by the compiler for conversion to uimin/uimax without changing the semantics until we can properly audit it all. (Who knows, maybe in some cases integer truncation is actually intended!)
2018-09-03 19:29:22 +03:00
mss = uimax(in_maxmtu, mss);
break;
}
switch (af) {
case AF_INET:
hdrsiz = sizeof(struct ip);
break;
1999-09-23 08:02:27 +04:00
#ifdef INET6
case AF_INET6:
hdrsiz = sizeof(struct ip6_hdr);
break;
1999-09-23 08:02:27 +04:00
#endif
default:
hdrsiz = 0;
break;
}
hdrsiz += sizeof(struct tcphdr);
if (mss > hdrsiz)
mss -= hdrsiz;
Rename min/max -> uimin/uimax for better honesty. These functions are defined on unsigned int. The generic name min/max should not silently truncate to 32 bits on 64-bit systems. This is purely a name change -- no functional change intended. HOWEVER! Some subsystems have #define min(a, b) ((a) < (b) ? (a) : (b)) #define max(a, b) ((a) > (b) ? (a) : (b)) even though our standard name for that is MIN/MAX. Although these may invite multiple evaluation bugs, these do _not_ cause integer truncation. To avoid `fixing' these cases, I first changed the name in libkern, and then compile-tested every file where min/max occurred in order to confirm that it failed -- and thus confirm that nothing shadowed min/max -- before changing it. I have left a handful of bootloaders that are too annoying to compile-test, and some dead code: cobalt ews4800mips hp300 hppa ia64 luna68k vax acorn32/if_ie.c (not included in any kernels) macppc/if_gm.c (superseded by gem(4)) It should be easy to fix the fallout once identified -- this way of doing things fails safe, and the goal here, after all, is to _avoid_ silent integer truncations, not introduce them. Maybe one day we can reintroduce min/max as type-generic things that never silently truncate. But we should avoid doing that for a while, so that existing code has a chance to be detected by the compiler for conversion to uimin/uimax without changing the semantics until we can properly audit it all. (Who knows, maybe in some cases integer truncation is actually intended!)
2018-09-03 19:29:22 +03:00
mss = uimax(tcp_mssdflt, mss);
return (mss);
}
/*
* Set connection variables based on the peer's advertised MSS.
* We are passed the TCPCB for the actual connection. If we
* are the server, we are called by the compressed state engine
* when the 3-way handshake is complete. If we are the client,
2001-06-12 19:17:10 +04:00
* we are called when we receive the SYN,ACK from the server.
*
* NOTE: Our advertised MSS value must be initialized in the TCPCB
* before this routine is called!
*/
void
2005-02-04 02:50:33 +03:00
tcp_mss_from_peer(struct tcpcb *tp, int offer)
{
struct socket *so;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
struct rtentry *rt;
#endif
u_long bufsize;
int mss;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
so = tp->t_inpcb->inp_socket;
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
/*
2002-06-09 20:33:36 +04:00
* As per RFC1122, use the default MSS value, unless they
* sent us an offer. Do not accept offers less than 256 bytes.
*/
mss = tcp_mssdflt;
if (offer)
mss = offer;
Rename min/max -> uimin/uimax for better honesty. These functions are defined on unsigned int. The generic name min/max should not silently truncate to 32 bits on 64-bit systems. This is purely a name change -- no functional change intended. HOWEVER! Some subsystems have #define min(a, b) ((a) < (b) ? (a) : (b)) #define max(a, b) ((a) > (b) ? (a) : (b)) even though our standard name for that is MIN/MAX. Although these may invite multiple evaluation bugs, these do _not_ cause integer truncation. To avoid `fixing' these cases, I first changed the name in libkern, and then compile-tested every file where min/max occurred in order to confirm that it failed -- and thus confirm that nothing shadowed min/max -- before changing it. I have left a handful of bootloaders that are too annoying to compile-test, and some dead code: cobalt ews4800mips hp300 hppa ia64 luna68k vax acorn32/if_ie.c (not included in any kernels) macppc/if_gm.c (superseded by gem(4)) It should be easy to fix the fallout once identified -- this way of doing things fails safe, and the goal here, after all, is to _avoid_ silent integer truncations, not introduce them. Maybe one day we can reintroduce min/max as type-generic things that never silently truncate. But we should avoid doing that for a while, so that existing code has a chance to be detected by the compiler for conversion to uimin/uimax without changing the semantics until we can properly audit it all. (Who knows, maybe in some cases integer truncation is actually intended!)
2018-09-03 19:29:22 +03:00
mss = uimax(mss, 256); /* sanity */
tp->t_peermss = mss;
mss -= tcp_optlen(tp);
if (tp->t_inpcb->inp_af == AF_INET)
mss -= ip_optlen(tp->t_inpcb);
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6)
mss -= ip6_optlen(tp->t_inpcb);
#endif
2018-05-23 21:40:29 +03:00
/*
* XXX XXX What if mss goes negative or zero? This can happen if a
* socket has large IPv6 options. We crash below.
*/
/*
* If there's a pipesize, change the socket buffer to that size.
* Make the socket buffer an integral number of MSS units. If
* the MSS is larger than the socket buffer, artificially decrease
* the MSS.
*/
#ifdef RTV_SPIPE
if (rt != NULL && rt->rt_rmx.rmx_sendpipe != 0)
bufsize = rt->rt_rmx.rmx_sendpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_snd.sb_hiwat;
}
if (bufsize < mss)
mss = bufsize;
else {
bufsize = roundup(bufsize, mss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_snd, bufsize, so);
}
tp->t_segsz = mss;
#ifdef RTV_SSTHRESH
if (rt != NULL && rt->rt_rmx.rmx_ssthresh) {
/*
* There's some sort of gateway or interface buffer
* limit on the path. Use this to set the slow
* start threshold, but set the threshold to no less
* than 2 * MSS.
*/
Rename min/max -> uimin/uimax for better honesty. These functions are defined on unsigned int. The generic name min/max should not silently truncate to 32 bits on 64-bit systems. This is purely a name change -- no functional change intended. HOWEVER! Some subsystems have #define min(a, b) ((a) < (b) ? (a) : (b)) #define max(a, b) ((a) > (b) ? (a) : (b)) even though our standard name for that is MIN/MAX. Although these may invite multiple evaluation bugs, these do _not_ cause integer truncation. To avoid `fixing' these cases, I first changed the name in libkern, and then compile-tested every file where min/max occurred in order to confirm that it failed -- and thus confirm that nothing shadowed min/max -- before changing it. I have left a handful of bootloaders that are too annoying to compile-test, and some dead code: cobalt ews4800mips hp300 hppa ia64 luna68k vax acorn32/if_ie.c (not included in any kernels) macppc/if_gm.c (superseded by gem(4)) It should be easy to fix the fallout once identified -- this way of doing things fails safe, and the goal here, after all, is to _avoid_ silent integer truncations, not introduce them. Maybe one day we can reintroduce min/max as type-generic things that never silently truncate. But we should avoid doing that for a while, so that existing code has a chance to be detected by the compiler for conversion to uimin/uimax without changing the semantics until we can properly audit it all. (Who knows, maybe in some cases integer truncation is actually intended!)
2018-09-03 19:29:22 +03:00
tp->snd_ssthresh = uimax(2 * mss, rt->rt_rmx.rmx_ssthresh);
}
#endif
#if defined(RTV_SPIPE) || defined(RTV_SSTHRESH)
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
* Processing necessary when a TCP connection is established.
*/
void
2005-02-04 02:50:33 +03:00
tcp_established(struct tcpcb *tp)
{
struct socket *so;
#ifdef RTV_RPIPE
struct rtentry *rt;
#endif
u_long bufsize;
KASSERT(tp->t_inpcb != NULL);
so = NULL;
rt = NULL;
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
/* This is a while() to reduce the dreadful stairstepping below */
while (tp->t_inpcb->inp_af == AF_INET) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
rt = inpcb_rtentry(tp->t_inpcb);
#endif
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
if (__predict_true(tcp_msl_enable)) {
if (in4p_laddr(tp->t_inpcb).s_addr == INADDR_LOOPBACK) {
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in_localaddr(in4p_faddr(tp->t_inpcb))) {
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#ifdef INET6
while (tp->t_inpcb->inp_af == AF_INET6) {
so = tp->t_inpcb->inp_socket;
#if defined(RTV_RPIPE)
2022-11-04 12:01:53 +03:00
rt = in6pcb_rtentry(tp->t_inpcb);
#endif
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
if (__predict_true(tcp_msl_enable)) {
extern const struct in6_addr in6addr_loopback;
if (IN6_ARE_ADDR_EQUAL(&in6p_laddr(tp->t_inpcb),
&in6addr_loopback)) {
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
tp->t_msl = tcp_msl_loop ? tcp_msl_loop : (TCPTV_MSL >> 2);
break;
}
if (__predict_false(tcp_rttlocal)) {
/* This may be adjusted by tcp_input */
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
if (in6_localaddr(&in6p_faddr(tp->t_inpcb))) {
Reduces the resources demanded by TCP sessions in TIME_WAIT-state using methods called Vestigial Time-Wait (VTW) and Maximum Segment Lifetime Truncation (MSLT). MSLT and VTW were contributed by Coyote Point Systems, Inc. Even after a TCP session enters the TIME_WAIT state, its corresponding socket and protocol control blocks (PCBs) stick around until the TCP Maximum Segment Lifetime (MSL) expires. On a host whose workload necessarily creates and closes down many TCP sockets, the sockets & PCBs for TCP sessions in TIME_WAIT state amount to many megabytes of dead weight in RAM. Maximum Segment Lifetimes Truncation (MSLT) assigns each TCP session to a class based on the nearness of the peer. Corresponding to each class is an MSL, and a session uses the MSL of its class. The classes are loopback (local host equals remote host), local (local host and remote host are on the same link/subnet), and remote (local host and remote host communicate via one or more gateways). Classes corresponding to nearer peers have lower MSLs by default: 2 seconds for loopback, 10 seconds for local, 60 seconds for remote. Loopback and local sessions expire more quickly when MSLT is used. Vestigial Time-Wait (VTW) replaces a TIME_WAIT session's PCB/socket dead weight with a compact representation of the session, called a "vestigial PCB". VTW data structures are designed to be very fast and memory-efficient: for fast insertion and lookup of vestigial PCBs, the PCBs are stored in a hash table that is designed to minimize the number of cacheline visits per lookup/insertion. The memory both for vestigial PCBs and for elements of the PCB hashtable come from fixed-size pools, and linked data structures exploit this to conserve memory by representing references with a narrow index/offset from the start of a pool instead of a pointer. When space for new vestigial PCBs runs out, VTW makes room by discarding old vestigial PCBs, oldest first. VTW cooperates with MSLT. 
It may help to think of VTW as a "FIN cache" by analogy to the SYN cache. A 2.8-GHz Pentium 4 running a test workload that creates TIME_WAIT sessions as fast as it can is approximately 17% idle when VTW is active versus 0% idle when VTW is inactive. It has 103 megabytes more free RAM when VTW is active (approximately 64k vestigial PCBs are created) than when it is inactive.
2011-05-03 22:28:44 +04:00
tp->t_msl = tcp_msl_local ? tcp_msl_local : (TCPTV_MSL >> 1);
break;
}
}
tp->t_msl = tcp_msl_remote ? tcp_msl_remote : TCPTV_MSL;
break;
}
/* Clamp to a reasonable range. */
tp->t_msl = MIN(tp->t_msl, TCP_MAXMSL);
#endif
tp->t_state = TCPS_ESTABLISHED;
TCP_TIMER_ARM(tp, TCPT_KEEP, tp->t_keepidle);
#ifdef RTV_RPIPE
if (rt != NULL && rt->rt_rmx.rmx_recvpipe != 0)
bufsize = rt->rt_rmx.rmx_recvpipe;
else
#endif
{
KASSERT(so != NULL);
bufsize = so->so_rcv.sb_hiwat;
}
if (bufsize > tp->t_ourmss) {
bufsize = roundup(bufsize, tp->t_ourmss);
if (bufsize > sb_max)
bufsize = sb_max;
(void) sbreserve(&so->so_rcv, bufsize, so);
}
#ifdef RTV_RPIPE
inpcb_rtentry_unref(rt, tp->t_inpcb);
#endif
}
/*
 * Check if there's an initial rtt or rttvar.  Convert from the
 * route-table units to scaled multiples of the slow timeout timer.
 * Called only during the 3-way handshake.
 *
 * Only acts when RTV_RTT is defined, the PCB has a route, the
 * connection has no smoothed RTT yet (tp->t_srtt == 0) and the route
 * carries a non-zero rmx_rtt.  In that case t_srtt, t_rttvar and
 * t_rxtcur are seeded from the route metrics.
 */
void
tcp_rmx_rtt(struct tcpcb *tp)
{
#ifdef RTV_RTT
	struct rtentry *route;
	int metric_rtt;

	KASSERT(tp->t_inpcb != NULL);

	route = inpcb_rtentry(tp->t_inpcb);
	if (route == NULL)
		return;

	if (tp->t_srtt != 0)
		goto out;
	metric_rtt = route->rt_rmx.rmx_rtt;
	if (metric_rtt == 0)
		goto out;

	/*
	 * XXX When the RTT metric is locked (RTV_RTT set in rmx_locks)
	 * the value is also treated as a minimum; this is subject to time.
	 */
	if (route->rt_rmx.rmx_locks & RTV_RTT) {
		TCPT_RANGESET(tp->t_rttmin,
		    metric_rtt / (RTM_RTTUNIT / PR_SLOWHZ),
		    TCPTV_MIN, TCPTV_REXMTMAX);
	}

	tp->t_srtt = metric_rtt /
	    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTT_SHIFT + 2));

	if (route->rt_rmx.rmx_rttvar) {
		tp->t_rttvar = route->rt_rmx.rmx_rttvar /
		    ((RTM_RTTUNIT / PR_SLOWHZ) >> (TCP_RTTVAR_SHIFT + 2));
	} else {
		/* Default variation is +- 1 rtt */
		tp->t_rttvar =
		    tp->t_srtt >> (TCP_RTT_SHIFT - TCP_RTTVAR_SHIFT);
	}

	TCPT_RANGESET(tp->t_rxtcur,
	    ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2),
	    tp->t_rttmin, TCPTV_REXMTMAX);

out:
	inpcb_rtentry_unref(route, tp->t_inpcb);
#endif
}
/*
 * Offset that tcp_new_iss1() adds to every initial sequence number it
 * computes; starts out as zero.
 */
tcp_seq tcp_iss_seq = 0; /* tcp initial seq # */
/*
* Get a new sequence value given a tcp control block
*/
tcp_seq
tcp_new_iss(struct tcpcb *tp)
{
if (tp->t_inpcb->inp_af == AF_INET) {
return tcp_new_iss1(&in4p_laddr(tp->t_inpcb),
&in4p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in4p_laddr(tp->t_inpcb)));
}
#ifdef INET6
if (tp->t_inpcb->inp_af == AF_INET6) {
return tcp_new_iss1(&in6p_laddr(tp->t_inpcb),
&in6p_faddr(tp->t_inpcb), tp->t_inpcb->inp_lport,
tp->t_inpcb->inp_fport, sizeof(in6p_laddr(tp->t_inpcb)));
}
#endif
panic("tcp_new_iss: unreachable");
}
/*
 * Secret that tcp_new_iss1() feeds into the MD5 hash when it computes an
 * RFC 1948 style initial sequence number; filled in by
 * tcp_iss_secret_init().
 */
static u_int8_t tcp_iss_secret[16]; /* 128 bits; should be plenty */
/*
 * Initialize the RFC 1948 ISS secret.
 *
 * Fills tcp_iss_secret with bytes obtained from the strong kernel CPRNG.
 * Run through RUN_ONCE() by tcp_new_iss1(); always returns 0.
 */
static int
tcp_iss_secret_init(void)
{
	(void)cprng_strong(kern_cprng, tcp_iss_secret,
	    sizeof(tcp_iss_secret), 0);

	return 0;
}
/*
* This routine actually generates a new TCP initial sequence number.
*/
tcp_seq
tcp_new_iss1(void *laddr, void *faddr, u_int16_t lport, u_int16_t fport,
size_t addrsz)
{
tcp_seq tcp_iss;
if (tcp_do_rfc1948) {
MD5_CTX ctx;
u_int8_t hash[16]; /* XXX MD5 knowledge */
static ONCE_DECL(tcp_iss_secret_control);
/*
* If we haven't been here before, initialize our cryptographic
* hash secret.
*/
RUN_ONCE(&tcp_iss_secret_control, tcp_iss_secret_init);
/*
* Compute the base value of the ISS. It is a hash
* of (saddr, sport, daddr, dport, secret).
*/
MD5Init(&ctx);
MD5Update(&ctx, (u_char *) laddr, addrsz);
MD5Update(&ctx, (u_char *) &lport, sizeof(lport));
MD5Update(&ctx, (u_char *) faddr, addrsz);
MD5Update(&ctx, (u_char *) &fport, sizeof(fport));
MD5Update(&ctx, tcp_iss_secret, sizeof(tcp_iss_secret));
MD5Final(hash, &ctx);
memcpy(&tcp_iss, hash, sizeof(tcp_iss));
#ifdef TCPISS_DEBUG
printf("ISS hash 0x%08x, ", tcp_iss);
#endif
First step of random number subsystem rework described in <20111022023242.BA26F14A158@mail.netbsd.org>. This change includes the following: An initial cleanup and minor reorganization of the entropy pool code in sys/dev/rnd.c and sys/dev/rndpool.c. Several bugs are fixed. Some effort is made to accumulate entropy more quickly at boot time. A generic interface, "rndsink", is added, for stream generators to request that they be re-keyed with good quality entropy from the pool as soon as it is available. The arc4random()/arc4randbytes() implementation in libkern is adjusted to use the rndsink interface for rekeying, which helps address the problem of low-quality keys at boot time. An implementation of the FIPS 140-2 statistical tests for random number generator quality is provided (libkern/rngtest.c). This is based on Greg Rose's implementation from Qualcomm. A new random stream generator, nist_ctr_drbg, is provided. It is based on an implementation of the NIST SP800-90 CTR_DRBG by Henric Jungheim. This generator users AES in a modified counter mode to generate a backtracking-resistant random stream. An abstraction layer, "cprng", is provided for in-kernel consumers of randomness. The arc4random/arc4randbytes API is deprecated for in-kernel use. It is replaced by "cprng_strong". The current cprng_fast implementation wraps the existing arc4random implementation. The current cprng_strong implementation wraps the new CTR_DRBG implementation. Both interfaces are rekeyed from the entropy pool automatically at intervals justifiable from best current cryptographic practice. In some quick tests, cprng_fast() is about the same speed as the old arc4randbytes(), and cprng_strong() is about 20% faster than rnd_extract_data(). Performance is expected to improve. The AES code in src/crypto/rijndael is no longer an optional kernel component, as it is required by cprng_strong, which is not an optional kernel component. 
The entropy pool output is subjected to the rngtest tests at startup time; if it fails, the system will reboot. There is approximately a 3/10000 chance of a false positive from these tests. Entropy pool _input_ from hardware random numbers is subjected to the rngtest tests at attach time, as well as the FIPS continuous-output test, to detect bad or stuck hardware RNGs; if any are detected, they are detached, but the system continues to run. A problem with rndctl(8) is fixed -- datastructures with pointers in arrays are no longer passed to userspace (this was not a security problem, but rather a major issue for compat32). A new kernel will require a new rndctl. The sysctl kern.arandom() and kern.urandom() nodes are hooked up to the new generators, but the /dev/*random pseudodevices are not, yet. Manual pages for the new kernel interfaces are forthcoming.
2011-11-20 02:51:18 +04:00
} else {
/*
* Randomize.
*/
tcp_iss = cprng_fast32() & TCP_ISS_RANDOM_MASK;
#ifdef TCPISS_DEBUG
printf("ISS random 0x%08x, ", tcp_iss);
#endif
}
/*
* Add the offset in to the computed value.
*/
tcp_iss += tcp_iss_seq;
#ifdef TCPISS_DEBUG
printf("ISS %08x\n", tcp_iss);
#endif
return tcp_iss;
}
#if defined(IPSEC)
/* compute ESP/AH header size for TCP, including outer IP header. */
size_t
2005-02-04 02:50:33 +03:00
ipsec4_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
/* XXX mapped addr case (tp->t_inpcb) */
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#ifdef INET6
size_t
2005-02-04 02:50:33 +03:00
ipsec6_hdrsiz_tcp(struct tcpcb *tp)
{
struct inpcb *inp;
size_t hdrsiz;
if (!tp || !tp->t_template || !(inp = tp->t_inpcb))
return 0;
switch (tp->t_family) {
case AF_INET6:
/* XXX: should use correct direction. */
hdrsiz = ipsec_hdrsiz(tp->t_template, IPSEC_DIR_OUTBOUND, inp);
break;
case AF_INET:
/* mapped address case - tricky */
default:
hdrsiz = 0;
break;
}
return hdrsiz;
}
#endif
#endif /*IPSEC*/
/*
* Determine the length of the TCP options for this connection.
2002-06-09 20:33:36 +04:00
*
* XXX: What do we do for SACK, when we add that? Just reserve
* all of the space? Otherwise we can't exactly be incrementing
* cwnd by an amount that varies depending on the amount we last
* had to SACK!
*/
u_int
2005-02-04 02:50:33 +03:00
tcp_optlen(struct tcpcb *tp)
{
Initial commit of a port of the FreeBSD implementation of RFC 2385 (MD5 signatures for TCP, as used with BGP). Credit for original FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship credited to sentex.net. Shortening of the setsockopt() name attributed to Vincent Jardin. This commit is a minimal, working version of the FreeBSD code, as MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp modified to set the TCP-MD5 option; BMS's additions to tcpdump-current (tcpdump -M) confirm that the MD5 signatures are correct. Committed as-is for further testing between a NetBSD BGP speaker (e.g., quagga) and industry-standard BGP speakers (e.g., Cisco, Juniper). NOTE: This version has two potential flaws. First, I do see any code that verifies recieved TCP-MD5 signatures. Second, the TCP-MD5 options are internally padded and assumed to be 32-bit aligned. A more space-efficient scheme is to pack all TCP options densely (and possibly unaligned) into the TCP header ; then do one final padding to a 4-byte boundary. Pre-existing comments note that accounting for TCP-option space when we add SACK is yet to be done. For now, I'm punting on that; we can solve it properly, in a way that will handle SACK blocks, as a separate exercise. 
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c ,and modifies: sys/net/pfkeyv2.h,v 1.15 sys/netinet/files.netinet,v 1.5 sys/netinet/ip.h,v 1.25 sys/netinet/tcp.h,v 1.15 sys/netinet/tcp_input.c,v 1.200 sys/netinet/tcp_output.c,v 1.109 sys/netinet/tcp_subr.c,v 1.165 sys/netinet/tcp_usrreq.c,v 1.89 sys/netinet/tcp_var.h,v 1.109 sys/netipsec/files.netipsec,v 1.3 sys/netipsec/ipsec.c,v 1.11 sys/netipsec/ipsec.h,v 1.7 sys/netipsec/key.c,v 1.11 share/man/man4/tcp.4,v 1.16 lib/libipsec/pfkey.c,v 1.20 lib/libipsec/pfkey_dump.c,v 1.17 lib/libipsec/policy_token.l,v 1.8 sbin/setkey/parse.y,v 1.14 sbin/setkey/setkey.8,v 1.27 sbin/setkey/token.l,v 1.15 Note that the preceding two revisions to tcp.4 will be required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
u_int optlen;
optlen = 0;
2002-06-09 20:33:36 +04:00
if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
(TF_REQ_TSTMP | TF_RCVD_TSTMP))
Initial commit of a port of the FreeBSD implementation of RFC 2385 (MD5 signatures for TCP, as used with BGP). Credit for original FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship credited to sentex.net. Shortening of the setsockopt() name attributed to Vincent Jardin. This commit is a minimal, working version of the FreeBSD code, as MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp modified to set the TCP-MD5 option; BMS's additions to tcpdump-current (tcpdump -M) confirm that the MD5 signatures are correct. Committed as-is for further testing between a NetBSD BGP speaker (e.g., quagga) and industry-standard BGP speakers (e.g., Cisco, Juniper). NOTE: This version has two potential flaws. First, I do see any code that verifies recieved TCP-MD5 signatures. Second, the TCP-MD5 options are internally padded and assumed to be 32-bit aligned. A more space-efficient scheme is to pack all TCP options densely (and possibly unaligned) into the TCP header ; then do one final padding to a 4-byte boundary. Pre-existing comments note that accounting for TCP-option space when we add SACK is yet to be done. For now, I'm punting on that; we can solve it properly, in a way that will handle SACK blocks, as a separate exercise. 
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c ,and modifies: sys/net/pfkeyv2.h,v 1.15 sys/netinet/files.netinet,v 1.5 sys/netinet/ip.h,v 1.25 sys/netinet/tcp.h,v 1.15 sys/netinet/tcp_input.c,v 1.200 sys/netinet/tcp_output.c,v 1.109 sys/netinet/tcp_subr.c,v 1.165 sys/netinet/tcp_usrreq.c,v 1.89 sys/netinet/tcp_var.h,v 1.109 sys/netipsec/files.netipsec,v 1.3 sys/netipsec/ipsec.c,v 1.11 sys/netipsec/ipsec.h,v 1.7 sys/netipsec/key.c,v 1.11 share/man/man4/tcp.4,v 1.16 lib/libipsec/pfkey.c,v 1.20 lib/libipsec/pfkey_dump.c,v 1.17 lib/libipsec/policy_token.l,v 1.8 sbin/setkey/parse.y,v 1.14 sbin/setkey/setkey.8,v 1.27 sbin/setkey/token.l,v 1.15 Note that the preceding two revisions to tcp.4 will be required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
optlen += TCPOLEN_TSTAMP_APPA;
#ifdef TCP_SIGNATURE
if (tp->t_flags & TF_SIGNATURE)
optlen += TCPOLEN_SIGLEN;
#endif
Initial commit of a port of the FreeBSD implementation of RFC 2385 (MD5 signatures for TCP, as used with BGP). Credit for original FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship credited to sentex.net. Shortening of the setsockopt() name attributed to Vincent Jardin. This commit is a minimal, working version of the FreeBSD code, as MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp modified to set the TCP-MD5 option; BMS's additions to tcpdump-current (tcpdump -M) confirm that the MD5 signatures are correct. Committed as-is for further testing between a NetBSD BGP speaker (e.g., quagga) and industry-standard BGP speakers (e.g., Cisco, Juniper). NOTE: This version has two potential flaws. First, I do see any code that verifies recieved TCP-MD5 signatures. Second, the TCP-MD5 options are internally padded and assumed to be 32-bit aligned. A more space-efficient scheme is to pack all TCP options densely (and possibly unaligned) into the TCP header ; then do one final padding to a 4-byte boundary. Pre-existing comments note that accounting for TCP-option space when we add SACK is yet to be done. For now, I'm punting on that; we can solve it properly, in a way that will handle SACK blocks, as a separate exercise. 
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c ,and modifies: sys/net/pfkeyv2.h,v 1.15 sys/netinet/files.netinet,v 1.5 sys/netinet/ip.h,v 1.25 sys/netinet/tcp.h,v 1.15 sys/netinet/tcp_input.c,v 1.200 sys/netinet/tcp_output.c,v 1.109 sys/netinet/tcp_subr.c,v 1.165 sys/netinet/tcp_usrreq.c,v 1.89 sys/netinet/tcp_var.h,v 1.109 sys/netipsec/files.netipsec,v 1.3 sys/netipsec/ipsec.c,v 1.11 sys/netipsec/ipsec.h,v 1.7 sys/netipsec/key.c,v 1.11 share/man/man4/tcp.4,v 1.16 lib/libipsec/pfkey.c,v 1.20 lib/libipsec/pfkey_dump.c,v 1.17 lib/libipsec/policy_token.l,v 1.8 sbin/setkey/parse.y,v 1.14 sbin/setkey/setkey.8,v 1.27 sbin/setkey/token.l,v 1.15 Note that the preceding two revisions to tcp.4 will be required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
return optlen;
}
/*
 * Compute the combined header size for this connection: the IP or IPv6
 * header selected by the connection's family (nothing for any other
 * family), the TCP header, and the option space for timestamps and,
 * with TCP_SIGNATURE configured, for the signature option.
 */
u_int
tcp_hdrsz(struct tcpcb *tp)
{
	u_int hlen = sizeof(struct tcphdr);

	switch (tp->t_family) {
#ifdef INET6
	case AF_INET6:
		hlen += sizeof(struct ip6_hdr);
		break;
#endif
	case AF_INET:
		hlen += sizeof(struct ip);
		break;
	default:
		/* Unknown family: no network-layer header is counted. */
		break;
	}

	/* Timestamps requested and received, and options not disabled. */
	if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) ==
	    (TF_REQ_TSTMP|TF_RCVD_TSTMP))
		hlen += TCPOLEN_TSTAMP_APPA;

#ifdef TCP_SIGNATURE
	if ((tp->t_flags & TF_SIGNATURE) != 0)
		hlen += TCPOLEN_SIGLEN;
#endif

	return hlen;
}
/*
 * Function wrapper around the TCP_STATINC() macro for the TCP statistic
 * with index 'stat'.  The index must be below TCP_NSTATS; this is
 * checked with KASSERT.
 */
void
tcp_statinc(u_int stat)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATINC(stat);
}
/*
 * Function wrapper around the TCP_STATADD() macro: passes 'val' along
 * for the TCP statistic with index 'stat'.  The index must be below
 * TCP_NSTATS; this is checked with KASSERT.
 */
void
tcp_statadd(u_int stat, uint64_t val)
{
KASSERT(stat < TCP_NSTATS);
TCP_STATADD(stat, val);
}