2017-01-04 18:09:37 +03:00
|
|
|
/* $NetBSD: tcp_output.c,v 1.194 2017/01/04 15:09:37 martin Exp $ */
|
1999-07-01 12:12:45 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Copyright (C) 1995, 1996, 1997, and 1998 WIDE Project.
|
|
|
|
* All rights reserved.
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
1999-07-01 12:12:45 +04:00
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. Neither the name of the project nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
1999-07-01 12:12:45 +04:00
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE PROJECT AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE PROJECT OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*/
|
1998-02-19 05:36:42 +03:00
|
|
|
|
2002-01-24 05:12:29 +03:00
|
|
|
/*
|
|
|
|
* @(#)COPYRIGHT 1.1 (NRL) 17 January 1995
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
2002-01-24 05:12:29 +03:00
|
|
|
* NRL grants permission for redistribution and use in source and binary
|
|
|
|
* forms, with or without modification, of the software and documentation
|
|
|
|
* created at NRL provided that the following conditions are met:
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
2002-01-24 05:12:29 +03:00
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
|
|
* must display the following acknowledgements:
|
|
|
|
* This product includes software developed by the University of
|
|
|
|
* California, Berkeley and its contributors.
|
|
|
|
* This product includes software developed at the Information
|
|
|
|
* Technology Division, US Naval Research Laboratory.
|
|
|
|
* 4. Neither the name of the NRL nor the names of its contributors
|
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
2002-01-24 05:12:29 +03:00
|
|
|
* THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS
|
|
|
|
* IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
|
|
|
|
* PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR
|
|
|
|
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
|
|
|
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
|
|
|
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
|
|
|
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
|
|
|
* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
|
|
|
* NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
2002-06-09 20:33:36 +04:00
|
|
|
*
|
2002-01-24 05:12:29 +03:00
|
|
|
* The views and conclusions contained in the software and documentation
|
|
|
|
* are those of the authors and should not be interpreted as representing
|
|
|
|
* official policies, either expressed or implied, of the US Naval
|
|
|
|
* Research Laboratory (NRL).
|
|
|
|
*/
|
|
|
|
|
1998-02-19 05:36:42 +03:00
|
|
|
/*-
|
2006-09-05 04:29:35 +04:00
|
|
|
* Copyright (c) 1997, 1998, 2001, 2005, 2006 The NetBSD Foundation, Inc.
|
1998-02-19 05:36:42 +03:00
|
|
|
* All rights reserved.
|
|
|
|
*
|
|
|
|
* This code is derived from software contributed to The NetBSD Foundation
|
|
|
|
* by Jason R. Thorpe and Kevin M. Lahey of the Numerical Aerospace Simulation
|
|
|
|
* Facility, NASA Ames Research Center.
|
2005-03-02 13:20:18 +03:00
|
|
|
* This code is derived from software contributed to The NetBSD Foundation
|
|
|
|
* by Charles M. Hannum.
|
2006-09-05 04:29:35 +04:00
|
|
|
* This code is derived from software contributed to The NetBSD Foundation
|
|
|
|
* by Rui Paulo.
|
1998-02-19 05:36:42 +03:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
|
|
|
|
* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
|
|
|
|
* TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
|
|
|
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
|
|
|
|
* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
|
|
|
* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
|
|
|
* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
|
|
|
* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
|
|
|
* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
|
|
|
* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
|
|
|
* POSSIBILITY OF SUCH DAMAGE.
|
|
|
|
*/
|
1994-06-29 10:29:24 +04:00
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
1998-01-05 13:31:44 +03:00
|
|
|
* Copyright (c) 1982, 1986, 1988, 1990, 1993, 1995
|
1994-05-13 10:02:48 +04:00
|
|
|
* The Regents of the University of California. All rights reserved.
|
1993-03-21 12:45:37 +03:00
|
|
|
*
|
|
|
|
* Redistribution and use in source and binary forms, with or without
|
|
|
|
* modification, are permitted provided that the following conditions
|
|
|
|
* are met:
|
|
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer.
|
|
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
|
|
* documentation and/or other materials provided with the distribution.
|
2003-08-07 20:26:28 +04:00
|
|
|
* 3. Neither the name of the University nor the names of its contributors
|
1993-03-21 12:45:37 +03:00
|
|
|
* may be used to endorse or promote products derived from this software
|
|
|
|
* without specific prior written permission.
|
|
|
|
*
|
|
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
* SUCH DAMAGE.
|
|
|
|
*
|
1998-01-05 13:31:44 +03:00
|
|
|
* @(#)tcp_output.c 8.4 (Berkeley) 5/24/95
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
|
|
|
|
2001-11-13 03:32:34 +03:00
|
|
|
#include <sys/cdefs.h>
|
2017-01-04 18:09:37 +03:00
|
|
|
__KERNEL_RCSID(0, "$NetBSD: tcp_output.c,v 1.194 2017/01/04 15:09:37 martin Exp $");
|
2001-11-13 03:32:34 +03:00
|
|
|
|
2015-08-25 01:21:26 +03:00
|
|
|
#ifdef _KERNEL_OPT
|
1999-07-01 12:12:45 +04:00
|
|
|
#include "opt_inet.h"
|
1999-07-10 02:57:15 +04:00
|
|
|
#include "opt_ipsec.h"
|
2001-07-08 20:18:56 +04:00
|
|
|
#include "opt_tcp_debug.h"
|
2015-08-25 01:21:26 +03:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <sys/param.h>
|
|
|
|
#include <sys/systm.h>
|
|
|
|
#include <sys/mbuf.h>
|
|
|
|
#include <sys/protosw.h>
|
|
|
|
#include <sys/socket.h>
|
|
|
|
#include <sys/socketvar.h>
|
|
|
|
#include <sys/errno.h>
|
1999-07-01 12:12:45 +04:00
|
|
|
#include <sys/domain.h>
|
2001-09-11 02:14:26 +04:00
|
|
|
#include <sys/kernel.h>
|
2004-05-18 18:44:14 +04:00
|
|
|
#ifdef TCP_SIGNATURE
|
|
|
|
#include <sys/md5.h>
|
|
|
|
#endif
|
1993-03-21 12:45:37 +03:00
|
|
|
|
1997-09-23 01:49:55 +04:00
|
|
|
#include <net/if.h>
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <net/route.h>
|
1993-03-21 12:45:37 +03:00
|
|
|
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <netinet/in.h>
|
|
|
|
#include <netinet/in_systm.h>
|
|
|
|
#include <netinet/ip.h>
|
|
|
|
#include <netinet/in_pcb.h>
|
|
|
|
#include <netinet/ip_var.h>
|
1999-07-01 12:12:45 +04:00
|
|
|
|
|
|
|
#ifdef INET6
|
|
|
|
#ifndef INET
|
|
|
|
#include <netinet/in.h>
|
|
|
|
#endif
|
|
|
|
#include <netinet/ip6.h>
|
2002-05-29 11:53:39 +04:00
|
|
|
#include <netinet6/in6_var.h>
|
1999-07-01 12:12:45 +04:00
|
|
|
#include <netinet6/ip6_var.h>
|
2002-05-29 11:53:39 +04:00
|
|
|
#include <netinet6/in6_pcb.h>
|
|
|
|
#include <netinet6/nd6.h>
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
|
|
|
|
2013-06-05 23:01:26 +04:00
|
|
|
#ifdef IPSEC
|
2003-08-15 07:42:00 +04:00
|
|
|
#include <netipsec/ipsec.h>
|
2004-05-21 02:59:02 +04:00
|
|
|
#include <netipsec/key.h>
|
2007-02-10 12:43:05 +03:00
|
|
|
#ifdef INET6
|
|
|
|
#include <netipsec/ipsec6.h>
|
|
|
|
#endif
|
2013-06-05 23:01:26 +04:00
|
|
|
#endif /* IPSEC*/
|
2002-11-02 22:03:44 +03:00
|
|
|
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <netinet/tcp.h>
|
1993-03-21 12:45:37 +03:00
|
|
|
#define TCPOUTFLAGS
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <netinet/tcp_fsm.h>
|
|
|
|
#include <netinet/tcp_seq.h>
|
|
|
|
#include <netinet/tcp_timer.h>
|
|
|
|
#include <netinet/tcp_var.h>
|
2008-04-12 09:58:22 +04:00
|
|
|
#include <netinet/tcp_private.h>
|
2006-10-09 20:27:07 +04:00
|
|
|
#include <netinet/tcp_congctl.h>
|
1993-12-18 03:40:47 +03:00
|
|
|
#include <netinet/tcpip.h>
|
|
|
|
#include <netinet/tcp_debug.h>
|
2005-04-19 01:55:06 +04:00
|
|
|
#include <netinet/in_offload.h>
|
2006-11-23 22:41:58 +03:00
|
|
|
#include <netinet6/in6_offload.h>
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
#ifdef notyet
|
|
|
|
extern struct mbuf *m_copypack();
|
|
|
|
#endif
|
|
|
|
|
1998-04-02 02:15:52 +04:00
|
|
|
/*
|
2005-06-28 23:16:02 +04:00
|
|
|
* Knob to enable Congestion Window Monitoring, and control
|
1998-04-30 22:27:20 +04:00
|
|
|
* the burst size it allows. Default burst is 4 packets, per
|
|
|
|
* the Internet draft.
|
1998-04-02 02:15:52 +04:00
|
|
|
*/
|
2002-06-13 20:31:05 +04:00
|
|
|
int tcp_cwm = 0;
|
1998-04-30 22:27:20 +04:00
|
|
|
int tcp_cwm_burstsize = 4;
|
1998-04-02 02:15:52 +04:00
|
|
|
|
2010-01-26 21:09:07 +03:00
|
|
|
int tcp_do_autosndbuf = 1;
|
2007-08-02 06:42:40 +04:00
|
|
|
int tcp_autosndbuf_inc = 8 * 1024;
|
|
|
|
int tcp_autosndbuf_max = 256 * 1024;
|
|
|
|
|
2002-04-27 05:47:58 +04:00
|
|
|
#ifdef TCP_OUTPUT_COUNTERS
|
|
|
|
#include <sys/device.h>
|
|
|
|
|
|
|
|
extern struct evcnt tcp_output_bigheader;
|
2003-10-24 07:12:53 +04:00
|
|
|
extern struct evcnt tcp_output_predict_hit;
|
|
|
|
extern struct evcnt tcp_output_predict_miss;
|
2002-04-27 05:47:58 +04:00
|
|
|
extern struct evcnt tcp_output_copysmall;
|
|
|
|
extern struct evcnt tcp_output_copybig;
|
|
|
|
extern struct evcnt tcp_output_refbig;
|
|
|
|
|
|
|
|
#define TCP_OUTPUT_COUNTER_INCR(ev) (ev)->ev_count++
|
|
|
|
#else
|
|
|
|
|
|
|
|
#define TCP_OUTPUT_COUNTER_INCR(ev) /* nothing */
|
|
|
|
|
|
|
|
#endif /* TCP_OUTPUT_COUNTERS */
|
|
|
|
|
2001-07-31 04:57:45 +04:00
|
|
|
static
|
|
|
|
#ifndef GPROF
|
2005-12-24 23:45:08 +03:00
|
|
|
inline
|
2001-07-31 04:57:45 +04:00
|
|
|
#endif
|
2004-02-04 08:36:03 +03:00
|
|
|
int
|
2006-11-23 22:41:58 +03:00
|
|
|
tcp_segsize(struct tcpcb *tp, int *txsegsizep, int *rxsegsizep,
|
2007-02-22 01:59:35 +03:00
|
|
|
bool *alwaysfragp)
|
1997-09-23 01:49:55 +04:00
|
|
|
{
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1997-09-23 01:49:55 +04:00
|
|
|
struct inpcb *inp = tp->t_inpcb;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
struct in6pcb *in6p = tp->t_in6pcb;
|
|
|
|
#endif
|
2002-11-24 13:51:56 +03:00
|
|
|
struct socket *so = NULL;
|
1997-09-23 01:49:55 +04:00
|
|
|
struct rtentry *rt;
|
|
|
|
struct ifnet *ifp;
|
|
|
|
int size;
|
2006-03-25 16:34:35 +03:00
|
|
|
int hdrlen;
|
2001-12-03 04:45:43 +03:00
|
|
|
int optlen;
|
1999-07-01 12:12:45 +04:00
|
|
|
|
2007-02-22 09:16:03 +03:00
|
|
|
*alwaysfragp = false;
|
2006-11-23 22:41:58 +03:00
|
|
|
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (tp->t_inpcb && tp->t_in6pcb)
|
|
|
|
panic("tcp_segsize: both t_inpcb and t_in6pcb are set");
|
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (tp->t_family) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
2006-03-25 16:34:35 +03:00
|
|
|
hdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
2006-03-25 16:34:35 +03:00
|
|
|
hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
default:
|
|
|
|
size = tcp_mssdflt;
|
|
|
|
goto out;
|
|
|
|
}
|
1997-09-23 01:49:55 +04:00
|
|
|
|
2000-10-17 07:06:42 +04:00
|
|
|
rt = NULL;
|
|
|
|
#ifdef INET
|
2002-08-20 20:29:42 +04:00
|
|
|
if (inp) {
|
1999-07-01 12:12:45 +04:00
|
|
|
rt = in_pcbrtentry(inp);
|
2002-08-20 20:29:42 +04:00
|
|
|
so = inp->inp_socket;
|
|
|
|
}
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
2000-10-20 00:22:59 +04:00
|
|
|
#ifdef INET6
|
2002-08-20 20:29:42 +04:00
|
|
|
if (in6p) {
|
1999-07-01 12:12:45 +04:00
|
|
|
rt = in6_pcbrtentry(in6p);
|
2002-08-20 20:29:42 +04:00
|
|
|
so = in6p->in6p_socket;
|
|
|
|
}
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
|
|
|
if (rt == NULL) {
|
1997-09-23 01:49:55 +04:00
|
|
|
size = tcp_mssdflt;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
ifp = rt->rt_ifp;
|
|
|
|
|
1999-07-01 12:12:45 +04:00
|
|
|
size = tcp_mssdflt;
|
2004-02-04 08:36:03 +03:00
|
|
|
if (tp->t_mtudisc && rt->rt_rmx.rmx_mtu != 0) {
|
|
|
|
#ifdef INET6
|
|
|
|
if (in6p && rt->rt_rmx.rmx_mtu < IPV6_MMTU) {
|
|
|
|
/*
|
|
|
|
* RFC2460 section 5, last paragraph: if path MTU is
|
|
|
|
* smaller than 1280, use 1280 as packet size and
|
|
|
|
* attach fragment header.
|
|
|
|
*/
|
2006-03-25 16:34:35 +03:00
|
|
|
size = IPV6_MMTU - hdrlen - sizeof(struct ip6_frag);
|
2007-02-22 09:16:03 +03:00
|
|
|
*alwaysfragp = true;
|
2004-02-04 08:36:03 +03:00
|
|
|
} else
|
2006-03-25 16:34:35 +03:00
|
|
|
size = rt->rt_rmx.rmx_mtu - hdrlen;
|
2004-02-04 08:36:03 +03:00
|
|
|
#else
|
2006-03-25 16:34:35 +03:00
|
|
|
size = rt->rt_rmx.rmx_mtu - hdrlen;
|
2004-02-04 08:36:03 +03:00
|
|
|
#endif
|
|
|
|
} else if (ifp->if_flags & IFF_LOOPBACK)
|
2006-03-25 16:34:35 +03:00
|
|
|
size = ifp->if_mtu - hdrlen;
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
2002-05-26 20:05:43 +04:00
|
|
|
else if (inp && tp->t_mtudisc)
|
2006-03-25 16:34:35 +03:00
|
|
|
size = ifp->if_mtu - hdrlen;
|
1999-07-01 12:12:45 +04:00
|
|
|
else if (inp && in_localaddr(inp->inp_faddr))
|
2006-03-25 16:34:35 +03:00
|
|
|
size = ifp->if_mtu - hdrlen;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
else if (in6p) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
if (IN6_IS_ADDR_V4MAPPED(&in6p->in6p_faddr)) {
|
|
|
|
/* mapped addr case */
|
|
|
|
struct in_addr d;
|
|
|
|
bcopy(&in6p->in6p_faddr.s6_addr32[3], &d, sizeof(d));
|
2002-05-26 20:05:43 +04:00
|
|
|
if (tp->t_mtudisc || in_localaddr(d))
|
2006-03-25 16:34:35 +03:00
|
|
|
size = ifp->if_mtu - hdrlen;
|
2000-10-17 07:06:42 +04:00
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
{
|
2001-04-03 10:14:31 +04:00
|
|
|
/*
|
|
|
|
* for IPv6, path MTU discovery is always turned on,
|
|
|
|
* or the node must use packet size <= 1280.
|
|
|
|
*/
|
2002-05-29 11:53:39 +04:00
|
|
|
size = tp->t_mtudisc ? IN6_LINKMTU(ifp) : IPV6_MMTU;
|
2006-03-25 16:34:35 +03:00
|
|
|
size -= hdrlen;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
}
|
2016-12-08 08:16:33 +03:00
|
|
|
#endif
|
|
|
|
#ifdef INET
|
|
|
|
if (inp)
|
|
|
|
in_pcbrtentry_unref(rt, inp);
|
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
if (in6p)
|
|
|
|
in6_pcbrtentry_unref(rt, in6p);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
2002-03-02 01:54:09 +03:00
|
|
|
out:
|
2001-12-03 04:45:43 +03:00
|
|
|
/*
|
|
|
|
* Now we must make room for whatever extra TCP/IP options are in
|
|
|
|
* the packet.
|
|
|
|
*/
|
|
|
|
optlen = tcp_optlen(tp);
|
|
|
|
|
1999-07-01 12:12:45 +04:00
|
|
|
/*
|
|
|
|
* XXX tp->t_ourmss should have the right size, but without this code
|
|
|
|
* fragmentation will occur... need more investigation
|
|
|
|
*/
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
if (inp) {
|
2013-06-05 23:01:26 +04:00
|
|
|
#if defined(IPSEC)
|
2014-05-30 05:39:03 +04:00
|
|
|
if (ipsec_used &&
|
|
|
|
!IPSEC_PCB_SKIP_IPSEC(inp->inp_sp, IPSEC_DIR_OUTBOUND))
|
2004-03-03 08:59:38 +03:00
|
|
|
optlen += ipsec4_hdrsiz_tcp(tp);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
2001-12-03 04:45:43 +03:00
|
|
|
optlen += ip_optlen(inp);
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
|
|
|
if (in6p && tp->t_family == AF_INET) {
|
2013-06-05 23:01:26 +04:00
|
|
|
#if defined(IPSEC)
|
2014-05-30 05:39:03 +04:00
|
|
|
if (ipsec_used &&
|
|
|
|
!IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
|
2004-03-03 08:59:38 +03:00
|
|
|
optlen += ipsec4_hdrsiz_tcp(tp);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
|
|
|
/* XXX size -= ip_optlen(in6p); */
|
2000-10-17 07:06:42 +04:00
|
|
|
} else
|
|
|
|
#endif
|
|
|
|
if (in6p && tp->t_family == AF_INET6) {
|
2013-06-05 23:01:26 +04:00
|
|
|
#if defined(IPSEC)
|
2014-05-30 05:39:03 +04:00
|
|
|
if (ipsec_used &&
|
|
|
|
!IPSEC_PCB_SKIP_IPSEC(in6p->in6p_sp, IPSEC_DIR_OUTBOUND))
|
2004-03-03 08:59:38 +03:00
|
|
|
optlen += ipsec6_hdrsiz_tcp(tp);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
2001-12-03 04:45:43 +03:00
|
|
|
optlen += ip6_optlen(in6p);
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
#endif
|
2001-12-03 04:45:43 +03:00
|
|
|
size -= optlen;
|
1997-09-23 01:49:55 +04:00
|
|
|
|
2004-02-04 08:36:03 +03:00
|
|
|
/* there may not be any room for data if mtu is too small */
|
|
|
|
if (size < 0)
|
|
|
|
return (EMSGSIZE);
|
|
|
|
|
1999-09-23 06:21:30 +04:00
|
|
|
/*
|
|
|
|
* *rxsegsizep holds *estimated* inbound segment size (estimation
|
|
|
|
* assumes that path MTU is the same for both ways). this is only
|
|
|
|
* for silly window avoidance, do not use the value for other purposes.
|
|
|
|
*
|
|
|
|
* ipseclen is subtracted from both sides, this may not be right.
|
|
|
|
* I'm not quite sure about this (could someone comment).
|
|
|
|
*/
|
2001-12-03 04:45:43 +03:00
|
|
|
*txsegsizep = min(tp->t_peermss - optlen, size);
|
2002-08-20 20:29:42 +04:00
|
|
|
/*
|
|
|
|
* Never send more than half a buffer full. This insures that we can
|
|
|
|
* always keep 2 packets on the wire, no matter what SO_SNDBUF is, and
|
2002-09-13 22:26:55 +04:00
|
|
|
* therefore acks will never be delayed unless we run out of data to
|
2002-08-20 20:29:42 +04:00
|
|
|
* transmit.
|
|
|
|
*/
|
|
|
|
if (so)
|
2006-10-02 02:29:20 +04:00
|
|
|
*txsegsizep = min(so->so_snd.sb_hiwat >> 1, *txsegsizep);
|
2001-12-03 04:45:43 +03:00
|
|
|
*rxsegsizep = min(tp->t_ourmss - optlen, size);
|
1997-11-08 05:35:22 +03:00
|
|
|
|
|
|
|
if (*txsegsizep != tp->t_segsz) {
|
1998-04-29 07:44:11 +04:00
|
|
|
/*
|
2002-06-09 20:33:36 +04:00
|
|
|
* If the new segment size is larger, we don't want to
|
1998-04-29 07:44:11 +04:00
|
|
|
* mess up the congestion window, but if it is smaller
|
|
|
|
* we'll have to reduce the congestion window to ensure
|
|
|
|
* that we don't get into trouble with initial windows
|
|
|
|
* and the rest. In any case, if the segment size
|
|
|
|
* has changed, chances are the path has, too, and
|
|
|
|
* our congestion window will be different.
|
1998-03-18 02:50:30 +03:00
|
|
|
*/
|
1998-04-29 07:44:11 +04:00
|
|
|
if (*txsegsizep < tp->t_segsz) {
|
2002-06-09 20:33:36 +04:00
|
|
|
tp->snd_cwnd = max((tp->snd_cwnd / tp->t_segsz)
|
1998-04-29 07:44:11 +04:00
|
|
|
* *txsegsizep, *txsegsizep);
|
2002-06-09 20:33:36 +04:00
|
|
|
tp->snd_ssthresh = max((tp->snd_ssthresh / tp->t_segsz)
|
1998-04-29 07:44:11 +04:00
|
|
|
* *txsegsizep, *txsegsizep);
|
|
|
|
}
|
1997-11-08 05:35:22 +03:00
|
|
|
tp->t_segsz = *txsegsizep;
|
|
|
|
}
|
2004-02-04 08:36:03 +03:00
|
|
|
|
|
|
|
return (0);
|
1997-09-23 01:49:55 +04:00
|
|
|
}
|
|
|
|
|
2001-07-31 06:25:22 +04:00
|
|
|
static
|
|
|
|
#ifndef GPROF
|
2005-12-24 23:45:08 +03:00
|
|
|
inline
|
2001-07-31 06:25:22 +04:00
|
|
|
#endif
|
|
|
|
int
|
|
|
|
tcp_build_datapkt(struct tcpcb *tp, struct socket *so, int off,
|
|
|
|
long len, int hdrlen, struct mbuf **mp)
|
|
|
|
{
|
2003-07-02 23:33:20 +04:00
|
|
|
struct mbuf *m, *m0;
|
2008-04-12 09:58:22 +04:00
|
|
|
uint64_t *tcps;
|
2001-07-31 06:25:22 +04:00
|
|
|
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps = TCP_STAT_GETREF();
|
2001-07-31 06:25:22 +04:00
|
|
|
if (tp->t_force && len == 1)
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDPROBE]++;
|
2001-07-31 06:25:22 +04:00
|
|
|
else if (SEQ_LT(tp->snd_nxt, tp->snd_max)) {
|
2015-02-14 15:57:52 +03:00
|
|
|
tp->t_sndrexmitpack++;
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDREXMITPACK]++;
|
|
|
|
tcps[TCP_STAT_SNDREXMITBYTE] += len;
|
2001-07-31 06:25:22 +04:00
|
|
|
} else {
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDPACK]++;
|
|
|
|
tcps[TCP_STAT_SNDBYTE] += len;
|
2001-07-31 06:25:22 +04:00
|
|
|
}
|
2008-04-12 09:58:22 +04:00
|
|
|
TCP_STAT_PUTREF();
|
2001-07-31 06:25:22 +04:00
|
|
|
#ifdef notyet
|
|
|
|
if ((m = m_copypack(so->so_snd.sb_mb, off,
|
|
|
|
(int)len, max_linkhdr + hdrlen)) == 0)
|
|
|
|
return (ENOBUFS);
|
|
|
|
/*
|
|
|
|
* m_copypack left space for our hdr; use it.
|
|
|
|
*/
|
|
|
|
m->m_len += hdrlen;
|
|
|
|
m->m_data -= hdrlen;
|
|
|
|
#else
|
|
|
|
MGETHDR(m, M_DONTWAIT, MT_HEADER);
|
2002-04-27 05:47:58 +04:00
|
|
|
if (__predict_false(m == NULL))
|
|
|
|
return (ENOBUFS);
|
2003-02-26 09:31:08 +03:00
|
|
|
MCLAIM(m, &tcp_tx_mowner);
|
2002-04-27 05:47:58 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX Because other code assumes headers will fit in
|
|
|
|
* XXX one header mbuf.
|
|
|
|
*
|
|
|
|
* (This code should almost *never* be run.)
|
|
|
|
*/
|
|
|
|
if (__predict_false((max_linkhdr + hdrlen) > MHLEN)) {
|
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_bigheader);
|
2001-07-31 06:25:22 +04:00
|
|
|
MCLGET(m, M_DONTWAIT);
|
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
|
|
|
m_freem(m);
|
2002-04-27 05:47:58 +04:00
|
|
|
return (ENOBUFS);
|
2001-07-31 06:25:22 +04:00
|
|
|
}
|
|
|
|
}
|
2002-04-27 05:47:58 +04:00
|
|
|
|
2001-07-31 06:25:22 +04:00
|
|
|
m->m_data += max_linkhdr;
|
|
|
|
m->m_len = hdrlen;
|
2003-06-29 22:58:26 +04:00
|
|
|
|
2003-07-02 23:33:20 +04:00
|
|
|
/*
|
|
|
|
* To avoid traversing the whole sb_mb chain for correct
|
2003-10-22 01:17:20 +04:00
|
|
|
* data to send, remember last sent mbuf, its offset and
|
|
|
|
* the sent size. When called the next time, see if the
|
|
|
|
* data to send is directly following the previous transfer.
|
|
|
|
* This is important for large TCP windows.
|
2003-07-02 23:33:20 +04:00
|
|
|
*/
|
2003-11-12 13:48:04 +03:00
|
|
|
if (off == 0 || tp->t_lastm == NULL ||
|
|
|
|
(tp->t_lastoff + tp->t_lastlen) != off) {
|
2003-10-23 21:02:23 +04:00
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_miss);
|
2003-06-29 22:58:26 +04:00
|
|
|
/*
|
2003-07-02 23:33:20 +04:00
|
|
|
* Either a new packet or a retransmit.
|
|
|
|
* Start from the beginning.
|
2003-06-29 22:58:26 +04:00
|
|
|
*/
|
2003-07-02 23:33:20 +04:00
|
|
|
tp->t_lastm = so->so_snd.sb_mb;
|
|
|
|
tp->t_inoff = off;
|
2003-10-22 01:17:20 +04:00
|
|
|
} else {
|
2003-10-23 21:02:23 +04:00
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_predict_hit);
|
2003-07-02 23:33:20 +04:00
|
|
|
tp->t_inoff += tp->t_lastlen;
|
2003-10-22 01:17:20 +04:00
|
|
|
}
|
2003-07-02 23:33:20 +04:00
|
|
|
|
|
|
|
/* Traverse forward to next packet */
|
|
|
|
while (tp->t_inoff > 0) {
|
|
|
|
if (tp->t_lastm == NULL)
|
|
|
|
panic("tp->t_lastm == NULL");
|
|
|
|
if (tp->t_inoff < tp->t_lastm->m_len)
|
|
|
|
break;
|
|
|
|
tp->t_inoff -= tp->t_lastm->m_len;
|
|
|
|
tp->t_lastm = tp->t_lastm->m_next;
|
|
|
|
}
|
2003-06-29 22:58:26 +04:00
|
|
|
|
2003-07-02 23:33:20 +04:00
|
|
|
tp->t_lastoff = off;
|
|
|
|
tp->t_lastlen = len;
|
|
|
|
m0 = tp->t_lastm;
|
|
|
|
off = tp->t_inoff;
|
|
|
|
|
|
|
|
if (len <= M_TRAILINGSPACE(m)) {
|
2007-03-04 08:59:00 +03:00
|
|
|
m_copydata(m0, off, (int) len, mtod(m, char *) + hdrlen);
|
2003-07-02 23:33:20 +04:00
|
|
|
m->m_len += len;
|
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copysmall);
|
|
|
|
} else {
|
2007-09-02 07:12:23 +04:00
|
|
|
m->m_next = m_copym(m0, off, (int) len, M_DONTWAIT);
|
2001-07-31 06:25:22 +04:00
|
|
|
if (m->m_next == NULL) {
|
|
|
|
m_freem(m);
|
|
|
|
return (ENOBUFS);
|
|
|
|
}
|
2002-04-27 05:47:58 +04:00
|
|
|
#ifdef TCP_OUTPUT_COUNTERS
|
|
|
|
if (m->m_next->m_flags & M_EXT)
|
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_refbig);
|
|
|
|
else
|
|
|
|
TCP_OUTPUT_COUNTER_INCR(&tcp_output_copybig);
|
|
|
|
#endif /* TCP_OUTPUT_COUNTERS */
|
2001-07-31 06:25:22 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
*mp = m;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
|
|
|
* Tcp output routine: figure out what should be sent and send it.
|
|
|
|
*/
|
1994-01-09 02:07:16 +03:00
|
|
|
int
|
2005-02-04 02:39:32 +03:00
|
|
|
tcp_output(struct tcpcb *tp)
|
1993-03-21 12:45:37 +03:00
|
|
|
{
|
2016-12-08 08:16:33 +03:00
|
|
|
struct rtentry *rt = NULL;
|
1999-07-01 12:12:45 +04:00
|
|
|
struct socket *so;
|
|
|
|
struct route *ro;
|
1999-01-20 06:39:54 +03:00
|
|
|
long len, win;
|
1993-03-21 12:45:37 +03:00
|
|
|
int off, flags, error;
|
2000-03-30 16:51:13 +04:00
|
|
|
struct mbuf *m;
|
1999-07-01 12:12:45 +04:00
|
|
|
struct ip *ip;
|
|
|
|
#ifdef INET6
|
|
|
|
struct ip6_hdr *ip6;
|
|
|
|
#endif
|
2000-03-30 16:51:13 +04:00
|
|
|
struct tcphdr *th;
|
2017-01-03 18:07:59 +03:00
|
|
|
u_char opt[MAX_TCPOPTLEN], *optp;
|
|
|
|
#define OPT_FITS(more) ((optlen + (more)) <= sizeof(opt))
|
2005-07-19 21:00:02 +04:00
|
|
|
unsigned optlen, hdrlen, packetlen;
|
2005-03-16 03:39:56 +03:00
|
|
|
unsigned int sack_numblks;
|
1997-10-08 20:32:48 +04:00
|
|
|
int idle, sendalot, txsegsize, rxsegsize;
|
2005-03-16 03:38:27 +03:00
|
|
|
int txsegsize_nosack;
|
1998-10-05 01:33:52 +04:00
|
|
|
int maxburst = TCP_MAXBURST;
|
1999-07-01 12:12:45 +04:00
|
|
|
int af; /* address family on the wire */
|
|
|
|
int iphdrlen;
|
2006-11-23 22:41:58 +03:00
|
|
|
int has_tso4, has_tso6;
|
2005-03-16 03:38:27 +03:00
|
|
|
int has_tso, use_tso;
|
2007-02-22 01:59:35 +03:00
|
|
|
bool alwaysfrag;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
int sack_rxmit;
|
|
|
|
int sack_bytes_rxmt;
|
2011-03-21 23:39:32 +03:00
|
|
|
int ecn_tos;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
struct sackhole *p;
|
Initial commit of a port of the FreeBSD implementation of RFC 2385
(MD5 signatures for TCP, as used with BGP). Credit for original
FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship
credited to sentex.net. Shortening of the setsockopt() name
attributed to Vincent Jardin.
This commit is a minimal, working version of the FreeBSD code, as
MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp
modified to set the TCP-MD5 option; BMS's additions to tcpdump-current
(tcpdump -M) confirm that the MD5 signatures are correct. Committed
as-is for further testing between a NetBSD BGP speaker (e.g., quagga)
and industry-standard BGP speakers (e.g., Cisco, Juniper).
NOTE: This version has two potential flaws. First, I do not see any code
that verifies received TCP-MD5 signatures. Second, the TCP-MD5
options are internally padded and assumed to be 32-bit aligned. A more
space-efficient scheme is to pack all TCP options densely (and
possibly unaligned) into the TCP header; then do one final padding to
a 4-byte boundary. Pre-existing comments note that accounting for
TCP-option space when we add SACK is yet to be done. For now, I'm
punting on that; we can solve it properly, in a way that will handle
SACK blocks, as a separate exercise.
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c,
and modifies:
sys/net/pfkeyv2.h,v 1.15
sys/netinet/files.netinet,v 1.5
sys/netinet/ip.h,v 1.25
sys/netinet/tcp.h,v 1.15
sys/netinet/tcp_input.c,v 1.200
sys/netinet/tcp_output.c,v 1.109
sys/netinet/tcp_subr.c,v 1.165
sys/netinet/tcp_usrreq.c,v 1.89
sys/netinet/tcp_var.h,v 1.109
sys/netipsec/files.netipsec,v 1.3
sys/netipsec/ipsec.c,v 1.11
sys/netipsec/ipsec.h,v 1.7
sys/netipsec/key.c,v 1.11
share/man/man4/tcp.4,v 1.16
lib/libipsec/pfkey.c,v 1.20
lib/libipsec/pfkey_dump.c,v 1.17
lib/libipsec/policy_token.l,v 1.8
sbin/setkey/parse.y,v 1.14
sbin/setkey/setkey.8,v 1.27
sbin/setkey/token.l,v 1.15
Note that the preceding two revisions to tcp.4 will be
required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
|
|
|
#ifdef TCP_SIGNATURE
|
|
|
|
int sigoff = 0;
|
|
|
|
#endif
|
2008-04-12 09:58:22 +04:00
|
|
|
uint64_t *tcps;
|
1999-07-01 12:12:45 +04:00
|
|
|
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (tp->t_inpcb && tp->t_in6pcb)
|
|
|
|
panic("tcp_output: both t_inpcb and t_in6pcb are set");
|
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
so = NULL;
|
|
|
|
ro = NULL;
|
|
|
|
if (tp->t_inpcb) {
|
|
|
|
so = tp->t_inpcb->inp_socket;
|
|
|
|
ro = &tp->t_inpcb->inp_route;
|
|
|
|
}
|
|
|
|
#ifdef INET6
|
|
|
|
else if (tp->t_in6pcb) {
|
|
|
|
so = tp->t_in6pcb->in6p_socket;
|
Eliminate address family-specific route caches (struct route, struct
route_in6, struct route_iso), replacing all caches with a struct
route.
The principal benefit of this change is that all of the protocol
families can benefit from route cache-invalidation, which is
necessary for correct routing. Route-cache invalidation fixes an
ancient PR, kern/3508, at long last; it fixes various other PRs,
also.
Discussions with and ideas from Joerg Sonnenberger influenced this
work tremendously. Of course, all design oversights and bugs are
mine.
DETAILS
1 I added to each address family a pool of sockaddrs. I have
introduced routines for allocating, copying, duplicating,
and freeing sockaddrs:
struct sockaddr *sockaddr_alloc(sa_family_t af, int flags);
struct sockaddr *sockaddr_copy(struct sockaddr *dst,
const struct sockaddr *src);
struct sockaddr *sockaddr_dup(const struct sockaddr *src, int flags);
void sockaddr_free(struct sockaddr *sa);
sockaddr_alloc() returns either a sockaddr from the pool belonging
to the specified family, or NULL if the pool is exhausted. The
returned sockaddr has the right size for that family; sa_family
and sa_len fields are initialized to the family and sockaddr
length---e.g., sa_family = AF_INET and sa_len = sizeof(struct
sockaddr_in). sockaddr_free() puts the given sockaddr back into
its family's pool.
sockaddr_dup() and sockaddr_copy() work analogously to strdup()
and strcpy(), respectively. sockaddr_copy() KASSERTs that the
family of the destination and source sockaddrs are alike.
The 'flags' argument for sockaddr_alloc() and sockaddr_dup() is
passed directly to pool_get(9).
2 I added routines for initializing sockaddrs in each address
family, sockaddr_in_init(), sockaddr_in6_init(), sockaddr_iso_init(),
etc. They are fairly self-explanatory.
3 structs route_in6 and route_iso are no more. All protocol families
use struct route. I have changed the route cache, 'struct route',
so that it does not contain storage space for a sockaddr. Instead,
struct route points to a sockaddr coming from the pool the sockaddr
belongs to. I added a new method to struct route, rtcache_setdst(),
for setting the cache destination:
int rtcache_setdst(struct route *, const struct sockaddr *);
rtcache_setdst() returns 0 on success, or ENOMEM if no memory is
available to create the sockaddr storage.
It is now possible for rtcache_getdst() to return NULL if, say,
rtcache_setdst() failed. I check the return value for NULL
everywhere in the kernel.
4 Each routing domain (struct domain) has a list of live route
caches, dom_rtcache. rtflushall(sa_family_t af) looks up the
domain indicated by 'af', walks the domain's list of route caches
and invalidates each one.
2007-05-03 00:40:22 +04:00
|
|
|
ro = &tp->t_in6pcb->in6p_route;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
switch (af = tp->t_family) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
|
|
|
if (tp->t_inpcb)
|
|
|
|
break;
|
|
|
|
#ifdef INET6
|
|
|
|
/* mapped addr case */
|
|
|
|
if (tp->t_in6pcb)
|
|
|
|
break;
|
|
|
|
#endif
|
2004-02-04 08:36:03 +03:00
|
|
|
return (EINVAL);
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
|
|
|
if (tp->t_in6pcb)
|
|
|
|
break;
|
2004-02-04 08:36:03 +03:00
|
|
|
return (EINVAL);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
|
|
|
default:
|
2004-02-04 08:36:03 +03:00
|
|
|
return (EAFNOSUPPORT);
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
1997-09-23 01:49:55 +04:00
|
|
|
|
2006-11-23 22:41:58 +03:00
|
|
|
if (tcp_segsize(tp, &txsegsize, &rxsegsize, &alwaysfrag))
|
2004-02-04 08:36:03 +03:00
|
|
|
return (EMSGSIZE);
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
idle = (tp->snd_max == tp->snd_una);
|
1998-04-02 02:15:52 +04:00
|
|
|
|
2005-03-06 03:35:07 +03:00
|
|
|
/*
|
|
|
|
* Determine if we can use TCP segmentation offload:
|
|
|
|
* - If we're using IPv4
|
|
|
|
* - If there is not an IPsec policy that prevents it
|
|
|
|
* - If the interface can do it
|
|
|
|
*/
|
2007-02-22 09:16:03 +03:00
|
|
|
has_tso4 = has_tso6 = false;
|
2006-11-23 22:41:58 +03:00
|
|
|
#if defined(INET)
|
|
|
|
has_tso4 = tp->t_inpcb != NULL &&
|
2013-06-05 23:01:26 +04:00
|
|
|
#if defined(IPSEC)
|
2014-10-21 17:44:47 +04:00
|
|
|
(!ipsec_used || IPSEC_PCB_SKIP_IPSEC(tp->t_inpcb->inp_sp,
|
|
|
|
IPSEC_DIR_OUTBOUND)) &&
|
2005-03-06 03:35:07 +03:00
|
|
|
#endif
|
2014-05-30 05:39:03 +04:00
|
|
|
(rt = rtcache_validate(&tp->t_inpcb->inp_route)) != NULL &&
|
|
|
|
(rt->rt_ifp->if_capenable & IFCAP_TSOv4) != 0;
|
2016-12-08 08:16:33 +03:00
|
|
|
if (rt != NULL) {
|
|
|
|
rtcache_unref(rt, &tp->t_inpcb->inp_route);
|
|
|
|
rt = NULL;
|
|
|
|
}
|
2006-11-23 22:41:58 +03:00
|
|
|
#endif /* defined(INET) */
|
|
|
|
#if defined(INET6)
|
|
|
|
has_tso6 = tp->t_in6pcb != NULL &&
|
2013-06-05 23:01:26 +04:00
|
|
|
#if defined(IPSEC)
|
2014-10-21 17:44:47 +04:00
|
|
|
(!ipsec_used || IPSEC_PCB_SKIP_IPSEC(tp->t_in6pcb->in6p_sp,
|
|
|
|
IPSEC_DIR_OUTBOUND)) &&
|
2006-11-23 22:41:58 +03:00
|
|
|
#endif
|
2014-05-30 05:39:03 +04:00
|
|
|
(rt = rtcache_validate(&tp->t_in6pcb->in6p_route)) != NULL &&
|
|
|
|
(rt->rt_ifp->if_capenable & IFCAP_TSOv6) != 0;
|
2016-12-08 08:16:33 +03:00
|
|
|
if (rt != NULL)
|
|
|
|
rtcache_unref(rt, &tp->t_in6pcb->in6p_route);
|
2006-11-23 22:41:58 +03:00
|
|
|
#endif /* defined(INET6) */
|
|
|
|
has_tso = (has_tso4 || has_tso6) && !alwaysfrag;
|
2005-03-06 03:35:07 +03:00
|
|
|
|
1998-07-18 02:52:01 +04:00
|
|
|
/*
|
|
|
|
* Restart Window computation. From draft-floyd-incr-init-win-03:
|
|
|
|
*
|
|
|
|
* Optionally, a TCP MAY set the restart window to the
|
|
|
|
* minimum of the value used for the initial window and
|
|
|
|
* the current value of cwnd (in other words, using a
|
|
|
|
* larger value for the restart window should never increase
|
|
|
|
* the size of cwnd).
|
|
|
|
*/
|
1998-04-02 02:15:52 +04:00
|
|
|
if (tcp_cwm) {
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
1998-04-02 02:15:52 +04:00
|
|
|
* Hughes/Touch/Heidemann Congestion Window Monitoring.
|
|
|
|
* Count the number of packets currently pending
|
|
|
|
* acknowledgement, and limit our congestion window
|
1998-05-02 05:00:24 +04:00
|
|
|
* to a pre-determined allowed burst size plus that count.
|
1998-04-02 02:15:52 +04:00
|
|
|
* This prevents bursting once all pending packets have
|
|
|
|
* been acknowledged (i.e. transmission is idle).
|
1998-07-18 03:00:02 +04:00
|
|
|
*
|
|
|
|
* XXX Link this to Initial Window?
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
1998-04-02 02:15:52 +04:00
|
|
|
tp->snd_cwnd = min(tp->snd_cwnd,
|
1998-04-30 22:27:20 +04:00
|
|
|
(tcp_cwm_burstsize * txsegsize) +
|
1998-04-02 02:15:52 +04:00
|
|
|
(tp->snd_nxt - tp->snd_una));
|
|
|
|
} else {
|
2001-09-10 19:23:09 +04:00
|
|
|
if (idle && (tcp_now - tp->t_rcvtime) >= tp->t_rxtcur) {
|
1998-04-02 02:15:52 +04:00
|
|
|
/*
|
|
|
|
* We have been idle for "a while" and no acks are
|
|
|
|
* expected to clock out any data we send --
|
|
|
|
* slow start to get ack "clock" running again.
|
|
|
|
*/
|
2003-03-01 07:40:27 +03:00
|
|
|
int ss = tcp_init_win;
|
|
|
|
#ifdef INET
|
|
|
|
if (tp->t_inpcb &&
|
|
|
|
in_localaddr(tp->t_inpcb->inp_faddr))
|
|
|
|
ss = tcp_init_win_local;
|
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
if (tp->t_in6pcb &&
|
|
|
|
in6_localaddr(&tp->t_in6pcb->in6p_faddr))
|
|
|
|
ss = tcp_init_win_local;
|
|
|
|
#endif
|
1998-07-18 02:52:01 +04:00
|
|
|
tp->snd_cwnd = min(tp->snd_cwnd,
|
2003-03-01 07:40:27 +03:00
|
|
|
TCP_INITIAL_WINDOW(ss, txsegsize));
|
1998-04-02 02:15:52 +04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2005-03-16 03:38:27 +03:00
|
|
|
txsegsize_nosack = txsegsize;
|
1993-03-21 12:45:37 +03:00
|
|
|
again:
|
2011-03-21 23:39:32 +03:00
|
|
|
ecn_tos = 0;
|
2005-03-16 03:39:56 +03:00
|
|
|
use_tso = has_tso;
|
2006-10-08 15:10:59 +04:00
|
|
|
if ((tp->t_flags & (TF_ECN_SND_CWR|TF_ECN_SND_ECE)) != 0) {
|
|
|
|
/* don't duplicate CWR/ECE. */
|
|
|
|
use_tso = 0;
|
|
|
|
}
|
2005-03-30 00:09:24 +04:00
|
|
|
TCP_REASS_LOCK(tp);
|
2005-03-16 03:39:56 +03:00
|
|
|
sack_numblks = tcp_sack_numblks(tp);
|
|
|
|
if (sack_numblks) {
|
2006-10-08 15:01:46 +04:00
|
|
|
int sackoptlen;
|
|
|
|
|
|
|
|
sackoptlen = TCP_SACK_OPTLEN(sack_numblks);
|
|
|
|
if (sackoptlen > txsegsize_nosack) {
|
|
|
|
sack_numblks = 0; /* give up SACK */
|
|
|
|
txsegsize = txsegsize_nosack;
|
|
|
|
} else {
|
|
|
|
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
|
|
|
|
/* don't duplicate D-SACK. */
|
|
|
|
use_tso = 0;
|
|
|
|
}
|
|
|
|
txsegsize = txsegsize_nosack - sackoptlen;
|
2005-03-16 03:39:56 +03:00
|
|
|
}
|
2005-03-16 03:38:27 +03:00
|
|
|
} else {
|
2005-03-16 03:39:56 +03:00
|
|
|
txsegsize = txsegsize_nosack;
|
2005-03-16 03:38:27 +03:00
|
|
|
}
|
|
|
|
|
1998-04-02 02:15:52 +04:00
|
|
|
/*
|
|
|
|
* Determine length of data that should be transmitted, and
|
|
|
|
* flags that should be used. If there is some data or critical
|
|
|
|
* controls (SYN, RST) to send, then transmit; otherwise,
|
|
|
|
* investigate further.
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
*
|
|
|
|
* Readjust SACK information to avoid resending duplicate data.
|
1998-04-02 02:15:52 +04:00
|
|
|
*/
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (TCP_SACK_ENABLED(tp) && SEQ_LT(tp->snd_nxt, tp->snd_max))
|
|
|
|
tcp_sack_adjust(tp);
|
1993-03-21 12:45:37 +03:00
|
|
|
sendalot = 0;
|
|
|
|
off = tp->snd_nxt - tp->snd_una;
|
|
|
|
win = min(tp->snd_wnd, tp->snd_cwnd);
|
|
|
|
|
1994-05-13 10:02:48 +04:00
|
|
|
flags = tcp_outflags[tp->t_state];
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Send any SACK-generated retransmissions. If we're explicitly trying
|
|
|
|
* to send out new data (when sendalot is 1), bypass this function.
|
|
|
|
* If we retransmit in fast recovery mode, decrement snd_cwnd, since
|
|
|
|
* we're replacing a (future) new transmission with a retransmission
|
|
|
|
* now, and we previously incremented snd_cwnd in tcp_input().
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Still in sack recovery, reset rxmit flag to zero.
|
|
|
|
*/
|
|
|
|
sack_rxmit = 0;
|
|
|
|
sack_bytes_rxmt = 0;
|
|
|
|
len = 0;
|
|
|
|
p = NULL;
|
2005-03-06 03:48:52 +03:00
|
|
|
do {
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
long cwin;
|
2005-03-06 03:48:52 +03:00
|
|
|
if (!TCP_SACK_ENABLED(tp))
|
|
|
|
break;
|
2005-03-06 03:52:25 +03:00
|
|
|
if (tp->t_partialacks < 0)
|
2005-03-06 03:48:52 +03:00
|
|
|
break;
|
|
|
|
p = tcp_sack_output(tp, &sack_bytes_rxmt);
|
|
|
|
if (p == NULL)
|
|
|
|
break;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
|
|
|
|
cwin = min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt;
|
|
|
|
if (cwin < 0)
|
|
|
|
cwin = 0;
|
|
|
|
/* Do not retransmit SACK segments beyond snd_recover */
|
|
|
|
if (SEQ_GT(p->end, tp->snd_recover)) {
|
|
|
|
/*
|
|
|
|
* (At least) part of sack hole extends beyond
|
|
|
|
* snd_recover. Check to see if we can rexmit data
|
|
|
|
* for this hole.
|
|
|
|
*/
|
|
|
|
if (SEQ_GEQ(p->rxmit, tp->snd_recover)) {
|
|
|
|
/*
|
|
|
|
* Can't rexmit any more data for this hole.
|
|
|
|
* That data will be rexmitted in the next
|
|
|
|
* sack recovery episode, when snd_recover
|
|
|
|
* moves past p->rxmit.
|
|
|
|
*/
|
|
|
|
p = NULL;
|
2005-03-06 03:48:52 +03:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* Can rexmit part of the current hole */
|
|
|
|
len = ((long)ulmin(cwin, tp->snd_recover - p->rxmit));
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
} else
|
|
|
|
len = ((long)ulmin(cwin, p->end - p->rxmit));
|
|
|
|
off = p->rxmit - tp->snd_una;
|
2005-05-08 08:51:05 +04:00
|
|
|
if (off + len > so->so_snd.sb_cc) {
|
|
|
|
/* 1 for TH_FIN */
|
|
|
|
KASSERT(off + len == so->so_snd.sb_cc + 1);
|
|
|
|
KASSERT(p->rxmit + len == tp->snd_max);
|
|
|
|
len = so->so_snd.sb_cc - off;
|
|
|
|
}
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (len > 0) {
|
|
|
|
sack_rxmit = 1;
|
|
|
|
sendalot = 1;
|
|
|
|
}
|
2005-03-06 06:41:36 +03:00
|
|
|
} while (/*CONSTCOND*/0);
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
|
|
|
* If in persist timeout with window of 0, send 1 byte.
|
|
|
|
* Otherwise, if window is small but nonzero
|
|
|
|
* and timer expired, we will send what we can
|
|
|
|
* and go to transmit state.
|
|
|
|
*/
|
|
|
|
if (tp->t_force) {
|
1994-05-13 10:02:48 +04:00
|
|
|
if (win == 0) {
|
|
|
|
/*
|
|
|
|
* If we still have some data to send, then
|
|
|
|
* clear the FIN bit. Usually this would
|
|
|
|
* happen below when it realizes that we
|
|
|
|
* aren't sending all the data. However,
|
|
|
|
* if we have exactly 1 byte of unsent data,
|
|
|
|
* then it won't clear the FIN bit below,
|
|
|
|
* and if we are in persist state, we wind
|
|
|
|
* up sending the packet without recording
|
|
|
|
* that we sent the FIN bit.
|
|
|
|
*
|
|
|
|
* We can't just blindly clear the FIN bit,
|
|
|
|
* because if we don't have any more data
|
|
|
|
* to send then the probe will be the FIN
|
|
|
|
* itself.
|
|
|
|
*/
|
|
|
|
if (off < so->so_snd.sb_cc)
|
|
|
|
flags &= ~TH_FIN;
|
1993-03-21 12:45:37 +03:00
|
|
|
win = 1;
|
1994-05-13 10:02:48 +04:00
|
|
|
} else {
|
1998-05-06 05:21:20 +04:00
|
|
|
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->t_rxtshift = 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2006-10-07 20:16:42 +04:00
|
|
|
if (sack_rxmit == 0) {
|
2006-10-17 15:11:40 +04:00
|
|
|
if (TCP_SACK_ENABLED(tp) && tp->t_partialacks >= 0) {
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
long cwin;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We are inside of a SACK recovery episode and are
|
|
|
|
* sending new data, having retransmitted all the
|
|
|
|
* data possible in the scoreboard.
|
|
|
|
*/
|
2005-05-08 08:48:47 +04:00
|
|
|
if (tp->snd_wnd < so->so_snd.sb_cc) {
|
|
|
|
len = tp->snd_wnd - off;
|
|
|
|
flags &= ~TH_FIN;
|
|
|
|
} else {
|
|
|
|
len = so->so_snd.sb_cc - off;
|
|
|
|
}
|
|
|
|
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
/*
|
|
|
|
* From FreeBSD:
|
|
|
|
* Don't remove this (len > 0) check !
|
|
|
|
* We explicitly check for len > 0 here (although it
|
|
|
|
* isn't really necessary), to work around a gcc
|
|
|
|
* optimization issue - to force gcc to compute
|
|
|
|
* len above. Without this check, the computation
|
|
|
|
* of len is bungled by the optimizer.
|
|
|
|
*/
|
|
|
|
if (len > 0) {
|
|
|
|
cwin = tp->snd_cwnd -
|
2006-10-07 20:16:42 +04:00
|
|
|
(tp->snd_nxt - tp->sack_newdata) -
|
|
|
|
sack_bytes_rxmt;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (cwin < 0)
|
|
|
|
cwin = 0;
|
2005-05-08 08:48:47 +04:00
|
|
|
if (cwin < len) {
|
|
|
|
len = cwin;
|
|
|
|
flags &= ~TH_FIN;
|
|
|
|
}
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
}
|
|
|
|
} else if (win < so->so_snd.sb_cc) {
|
|
|
|
len = win - off;
|
|
|
|
flags &= ~TH_FIN;
|
2006-10-07 20:16:42 +04:00
|
|
|
} else {
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
len = so->so_snd.sb_cc - off;
|
2006-10-07 20:16:42 +04:00
|
|
|
}
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
if (len < 0) {
|
|
|
|
/*
|
|
|
|
* If FIN has been sent but not acked,
|
|
|
|
* but we haven't been called to retransmit,
|
|
|
|
* len will be -1. Otherwise, window shrank
|
|
|
|
* after we sent into it. If window shrank to 0,
|
1997-12-17 08:59:32 +03:00
|
|
|
* cancel pending retransmit, pull snd_nxt back
|
|
|
|
* to (closed) window, and set the persist timer
|
|
|
|
* if it isn't already going. If the window didn't
|
|
|
|
* close completely, just wait for an ACK.
|
1998-07-21 14:46:00 +04:00
|
|
|
*
|
|
|
|
* If we have a pending FIN, either it has already been
|
|
|
|
* transmitted or it is outside the window, so drop it.
|
|
|
|
* If the FIN has been transmitted, but this is not a
|
|
|
|
* retransmission, then len must be -1. Therefore we also
|
|
|
|
* prevent here the sending of `gratuitous FINs'. This
|
|
|
|
* eliminates the need to check for that case below (e.g.
|
|
|
|
* to back up snd_nxt before the FIN so that the sequence
|
|
|
|
* number is correct).
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
|
|
|
len = 0;
|
1998-07-21 14:46:00 +04:00
|
|
|
flags &= ~TH_FIN;
|
1993-03-21 12:45:37 +03:00
|
|
|
if (win == 0) {
|
1998-05-06 05:21:20 +04:00
|
|
|
TCP_TIMER_DISARM(tp, TCPT_REXMT);
|
1997-12-17 08:59:32 +03:00
|
|
|
tp->t_rxtshift = 0;
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->snd_nxt = tp->snd_una;
|
1998-05-06 05:21:20 +04:00
|
|
|
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0)
|
1997-12-17 08:59:32 +03:00
|
|
|
tcp_setpersist(tp);
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
|
|
|
}
|
2007-08-02 06:42:40 +04:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Automatic sizing enables the performance of large buffers
|
|
|
|
* and most of the efficiency of small ones by only allocating
|
|
|
|
* space when it is needed.
|
|
|
|
*
|
|
|
|
* The criteria to step up the send buffer one notch are:
|
|
|
|
* 1. receive window of remote host is larger than send buffer
|
|
|
|
* (with a fudge factor of 5/4th);
|
|
|
|
* 2. send buffer is filled to 7/8th with data (so we actually
|
|
|
|
* have data to make use of it);
|
|
|
|
* 3. send buffer fill has not hit maximal automatic size;
|
|
|
|
* 4. our send window (slow start and congestion controlled) is
|
|
|
|
* larger than sent but unacknowledged data in send buffer.
|
|
|
|
*
|
|
|
|
* The remote host receive window scaling factor may limit the
|
|
|
|
* growing of the send buffer before it reaches its allowed
|
|
|
|
* maximum.
|
|
|
|
*
|
|
|
|
* It scales directly with slow start or congestion window
|
|
|
|
* and does at most one step per received ACK. This fast
|
|
|
|
* scaling has the drawback of growing the send buffer beyond
|
|
|
|
* what is strictly necessary to make full use of a given
|
|
|
|
* delay*bandwidth product. However testing has shown this not
|
|
|
|
* to be much of a problem. At worst we are trading wasting
|
|
|
|
* of available bandwidth (the non-use of it) for wasting some
|
|
|
|
* socket buffer memory.
|
|
|
|
*
|
|
|
|
* TODO: Shrink send buffer during idle periods together
|
|
|
|
* with congestion window. Requires another timer.
|
|
|
|
*/
|
|
|
|
if (tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) {
|
|
|
|
if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat &&
|
|
|
|
so->so_snd.sb_cc >= (so->so_snd.sb_hiwat / 8 * 7) &&
|
|
|
|
so->so_snd.sb_cc < tcp_autosndbuf_max &&
|
|
|
|
win >= (so->so_snd.sb_cc - (tp->snd_nxt - tp->snd_una))) {
|
|
|
|
if (!sbreserve(&so->so_snd,
|
|
|
|
min(so->so_snd.sb_hiwat + tcp_autosndbuf_inc,
|
|
|
|
tcp_autosndbuf_max), so))
|
|
|
|
so->so_snd.sb_flags &= ~SB_AUTOSIZE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
1997-10-08 20:32:48 +04:00
|
|
|
if (len > txsegsize) {
|
2005-03-06 03:35:07 +03:00
|
|
|
if (use_tso) {
|
|
|
|
/*
|
|
|
|
* Truncate TSO transfers to IP_MAXPACKET, and make
|
|
|
|
* sure that we send equal size transfers down the
|
|
|
|
* stack (rather than big-small-big-small-...).
|
|
|
|
*/
|
2006-11-24 02:12:59 +03:00
|
|
|
#ifdef INET6
|
2011-04-14 20:08:53 +04:00
|
|
|
CTASSERT(IPV6_MAXPACKET == IP_MAXPACKET);
|
2006-11-23 22:41:58 +03:00
|
|
|
#endif
|
2005-03-06 03:35:07 +03:00
|
|
|
len = (min(len, IP_MAXPACKET) / txsegsize) * txsegsize;
|
2005-03-12 10:53:08 +03:00
|
|
|
if (len <= txsegsize) {
|
|
|
|
use_tso = 0;
|
|
|
|
}
|
2005-03-06 03:35:07 +03:00
|
|
|
} else
|
|
|
|
len = txsegsize;
|
1995-01-23 23:18:35 +03:00
|
|
|
flags &= ~TH_FIN;
|
1993-03-21 12:45:37 +03:00
|
|
|
sendalot = 1;
|
2005-03-06 03:35:07 +03:00
|
|
|
} else
|
|
|
|
use_tso = 0;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (sack_rxmit) {
|
|
|
|
if (SEQ_LT(p->rxmit + len, tp->snd_una + so->so_snd.sb_cc))
|
|
|
|
flags &= ~TH_FIN;
|
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
win = sbspace(&so->so_rcv);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Sender silly window avoidance. If connection is idle
|
|
|
|
* and can send all data, a maximum segment,
|
|
|
|
* at least a maximum default-size segment do it,
|
|
|
|
* or are forced, do it; otherwise don't bother.
|
|
|
|
* If peer's buffer is tiny, then send
|
|
|
|
* when window is at least half open.
|
|
|
|
* If retransmitting (possibly after persist timer forced us
|
|
|
|
* to send into a small window), then must resend.
|
|
|
|
*/
|
|
|
|
if (len) {
|
2005-03-06 03:35:07 +03:00
|
|
|
if (len >= txsegsize)
|
1993-03-21 12:45:37 +03:00
|
|
|
goto send;
|
1998-12-16 03:33:14 +03:00
|
|
|
if ((so->so_state & SS_MORETOCOME) == 0 &&
|
|
|
|
((idle || tp->t_flags & TF_NODELAY) &&
|
|
|
|
len + off >= so->so_snd.sb_cc))
|
1993-03-21 12:45:37 +03:00
|
|
|
goto send;
|
|
|
|
if (tp->t_force)
|
|
|
|
goto send;
|
|
|
|
if (len >= tp->max_sndwnd / 2)
|
|
|
|
goto send;
|
|
|
|
if (SEQ_LT(tp->snd_nxt, tp->snd_max))
|
|
|
|
goto send;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (sack_rxmit)
|
|
|
|
goto send;
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
1997-10-08 20:32:48 +04:00
|
|
|
* Compare available window to amount of window known to peer
|
|
|
|
* (as advertised window less next expected input). If the
|
|
|
|
* difference is at least twice the size of the largest segment
|
|
|
|
* we expect to receive (i.e. two segments) or at least 50% of
|
|
|
|
* the maximum possible window, then want to send a window update
|
|
|
|
* to peer.
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
|
|
|
if (win > 0) {
|
2002-06-09 20:33:36 +04:00
|
|
|
/*
|
1994-05-13 10:02:48 +04:00
|
|
|
* "adv" is the amount we can increase the window,
|
|
|
|
* taking into account that we are limited by
|
|
|
|
* TCP_MAXWIN << tp->rcv_scale.
|
|
|
|
*/
|
|
|
|
long adv = min(win, (long)TCP_MAXWIN << tp->rcv_scale) -
|
|
|
|
(tp->rcv_adv - tp->rcv_nxt);
|
1993-03-21 12:45:37 +03:00
|
|
|
|
2015-04-27 19:50:17 +03:00
|
|
|
/*
|
|
|
|
* If the new window size ends up being the same as the old
|
|
|
|
* size when it is scaled, then don't force a window update.
|
|
|
|
*/
|
|
|
|
if ((tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale ==
|
|
|
|
(adv + tp->rcv_adv - tp->rcv_nxt) >> tp->rcv_scale)
|
|
|
|
goto dontupdate;
|
1997-10-08 20:32:48 +04:00
|
|
|
if (adv >= (long) (2 * rxsegsize))
|
1993-03-21 12:45:37 +03:00
|
|
|
goto send;
|
|
|
|
if (2 * adv >= (long) so->so_rcv.sb_hiwat)
|
|
|
|
goto send;
|
|
|
|
}
|
2015-04-27 19:50:17 +03:00
|
|
|
dontupdate:
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Send if we owe peer an ACK.
|
|
|
|
*/
|
|
|
|
if (tp->t_flags & TF_ACKNOW)
|
|
|
|
goto send;
|
1998-07-21 14:46:00 +04:00
|
|
|
if (flags & (TH_SYN|TH_FIN|TH_RST))
|
1993-03-21 12:45:37 +03:00
|
|
|
goto send;
|
|
|
|
if (SEQ_GT(tp->snd_up, tp->snd_una))
|
|
|
|
goto send;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
/*
|
|
|
|
* In SACK, it is possible for tcp_output to fail to send a segment
|
|
|
|
* after the retransmission timer has been turned off. Make sure
|
|
|
|
* that the retransmission timer is set.
|
|
|
|
*/
|
|
|
|
if (TCP_SACK_ENABLED(tp) && SEQ_GT(tp->snd_max, tp->snd_una) &&
|
|
|
|
!TCP_TIMER_ISARMED(tp, TCPT_REXMT) &&
|
|
|
|
!TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
|
|
|
|
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
|
|
|
|
goto just_return;
|
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* TCP window updates are not reliable, rather a polling protocol
|
|
|
|
* using ``persist'' packets is used to insure receipt of window
|
|
|
|
* updates. The three ``states'' for the output side are:
|
|
|
|
* idle not doing retransmits or persists
|
|
|
|
* persisting to move a small or zero window
|
|
|
|
* (re)transmitting and thereby not persisting
|
|
|
|
*
|
|
|
|
* tp->t_timer[TCPT_PERSIST]
|
|
|
|
* is set when we are in persist state.
|
|
|
|
* tp->t_force
|
|
|
|
* is set when we are called to send a persist packet.
|
|
|
|
* tp->t_timer[TCPT_REXMT]
|
|
|
|
* is set when we are retransmitting
|
|
|
|
* The output side is idle when both timers are zero.
|
|
|
|
*
|
|
|
|
* If send window is too small, there is data to transmit, and no
|
|
|
|
* retransmit or persist is pending, then go to persist state.
|
|
|
|
* If nothing happens soon, send when timer expires:
|
|
|
|
* if window is nonzero, transmit what we can,
|
|
|
|
* otherwise force out a byte.
|
|
|
|
*/
|
1998-05-06 05:21:20 +04:00
|
|
|
if (so->so_snd.sb_cc && TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0 &&
|
|
|
|
TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->t_rxtshift = 0;
|
|
|
|
tcp_setpersist(tp);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* No reason to send a segment, just return.
|
|
|
|
*/
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
just_return:
|
2005-03-30 00:09:24 +04:00
|
|
|
TCP_REASS_UNLOCK(tp);
|
1993-03-21 12:45:37 +03:00
|
|
|
return (0);
|
|
|
|
|
|
|
|
send:
|
|
|
|
/*
|
|
|
|
* Before ESTABLISHED, force sending of initial options
|
|
|
|
* unless TCP set not to do any options.
|
|
|
|
* NOTE: we assume that the IP/TCP header plus TCP options
|
|
|
|
* always fit in a single mbuf, leaving room for a maximum
|
|
|
|
* link header, i.e.
|
1999-07-01 12:12:45 +04:00
|
|
|
* max_linkhdr + sizeof (struct tcpiphdr) + optlen <= MCLBYTES
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
|
|
|
optlen = 0;
|
2017-01-02 04:18:42 +03:00
|
|
|
optp = opt;
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (af) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
|
|
|
iphdrlen = sizeof(struct ip) + sizeof(struct tcphdr);
|
|
|
|
break;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
|
|
|
iphdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr);
|
|
|
|
break;
|
|
|
|
#endif
|
1999-07-02 16:45:32 +04:00
|
|
|
default: /*pacify gcc*/
|
|
|
|
iphdrlen = 0;
|
|
|
|
break;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
hdrlen = iphdrlen;
|
1994-05-13 10:02:48 +04:00
|
|
|
if (flags & TH_SYN) {
|
2007-12-20 22:53:29 +03:00
|
|
|
struct rtentry *synrt;
|
1999-07-01 12:12:45 +04:00
|
|
|
|
2007-12-20 22:53:29 +03:00
|
|
|
synrt = NULL;
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
if (tp->t_inpcb)
|
2007-12-20 22:53:29 +03:00
|
|
|
synrt = in_pcbrtentry(tp->t_inpcb);
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
2000-10-20 00:22:59 +04:00
|
|
|
#ifdef INET6
|
2000-10-17 07:06:42 +04:00
|
|
|
if (tp->t_in6pcb)
|
2007-12-20 22:53:29 +03:00
|
|
|
synrt = in6_pcbrtentry(tp->t_in6pcb);
|
1999-07-01 12:12:45 +04:00
|
|
|
#endif
|
1998-04-14 01:18:19 +04:00
|
|
|
|
1994-05-13 10:02:48 +04:00
|
|
|
tp->snd_nxt = tp->iss;
|
2007-12-20 22:53:29 +03:00
|
|
|
tp->t_ourmss = tcp_mss_to_advertise(synrt != NULL ?
|
|
|
|
synrt->rt_ifp : NULL, af);
|
2016-12-08 08:16:33 +03:00
|
|
|
#ifdef INET
|
|
|
|
if (tp->t_inpcb)
|
|
|
|
in_pcbrtentry_unref(synrt, tp->t_inpcb);
|
|
|
|
#endif
|
|
|
|
#ifdef INET6
|
|
|
|
if (tp->t_in6pcb)
|
|
|
|
in6_pcbrtentry_unref(synrt, tp->t_in6pcb);
|
|
|
|
#endif
|
2017-01-03 23:59:32 +03:00
|
|
|
if ((tp->t_flags & TF_NOOPT) == 0 && OPT_FITS(TCPOLEN_MAXSEG)) {
|
2017-01-02 04:18:42 +03:00
|
|
|
*optp++ = TCPOPT_MAXSEG;
|
|
|
|
*optp++ = TCPOLEN_MAXSEG;
|
|
|
|
*optp++ = (tp->t_ourmss >> 8) & 0xff;
|
|
|
|
*optp++ = tp->t_ourmss & 0xff;
|
|
|
|
optlen += TCPOLEN_MAXSEG;
|
2002-06-09 20:33:36 +04:00
|
|
|
|
1994-05-13 10:02:48 +04:00
|
|
|
if ((tp->t_flags & TF_REQ_SCALE) &&
|
|
|
|
((flags & TH_ACK) == 0 ||
|
2014-10-25 19:07:13 +04:00
|
|
|
(tp->t_flags & TF_RCVD_SCALE)) &&
|
2017-01-03 23:59:32 +03:00
|
|
|
OPT_FITS(TCPOLEN_WINDOW + TCPOLEN_NOP)) {
|
2017-01-02 04:18:42 +03:00
|
|
|
*((uint32_t *)optp) = htonl(
|
1994-05-13 10:02:48 +04:00
|
|
|
TCPOPT_NOP << 24 |
|
|
|
|
TCPOPT_WINDOW << 16 |
|
|
|
|
TCPOLEN_WINDOW << 8 |
|
|
|
|
tp->request_r_scale);
|
2017-01-02 04:18:42 +03:00
|
|
|
optp += TCPOLEN_WINDOW + TCPOLEN_NOP;
|
|
|
|
optlen += TCPOLEN_WINDOW + TCPOLEN_NOP;
|
1994-05-13 10:02:48 +04:00
|
|
|
}
|
2017-01-03 23:59:32 +03:00
|
|
|
if (tcp_do_sack && OPT_FITS(TCPOLEN_SACK_PERMITTED)) {
|
2017-01-02 04:18:42 +03:00
|
|
|
*optp++ = TCPOPT_SACK_PERMITTED;
|
|
|
|
*optp++ = TCPOLEN_SACK_PERMITTED;
|
|
|
|
optlen += TCPOLEN_SACK_PERMITTED;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
}
|
1994-05-13 10:02:48 +04:00
|
|
|
}
|
2002-06-09 20:33:36 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Send a timestamp and echo-reply if this is a SYN and our side
|
1994-05-13 10:02:48 +04:00
|
|
|
* wants to use timestamps (TF_REQ_TSTMP is set) or both our side
|
|
|
|
* and our peer have sent timestamps in our SYN's.
|
2002-06-09 20:33:36 +04:00
|
|
|
*/
|
|
|
|
if ((tp->t_flags & (TF_REQ_TSTMP|TF_NOOPT)) == TF_REQ_TSTMP &&
|
|
|
|
(flags & TH_RST) == 0 &&
|
|
|
|
((flags & (TH_SYN|TH_ACK)) == TH_SYN ||
|
2017-01-02 04:18:42 +03:00
|
|
|
(tp->t_flags & TF_RCVD_TSTMP))) {
|
|
|
|
int alen = 0;
|
2017-01-04 15:35:14 +03:00
|
|
|
while (optlen % 4 != 2) {
|
2017-01-02 04:18:42 +03:00
|
|
|
optlen += TCPOLEN_NOP;
|
|
|
|
*optp++ = TCPOPT_NOP;
|
|
|
|
alen++;
|
|
|
|
}
|
|
|
|
if (OPT_FITS(TCPOLEN_TIMESTAMP)) {
|
|
|
|
*optp++ = TCPOPT_TIMESTAMP;
|
|
|
|
*optp++ = TCPOLEN_TIMESTAMP;
|
|
|
|
uint32_t *lp = (uint32_t *)optp;
|
|
|
|
/* Form timestamp option (appendix A of RFC 1323) */
|
|
|
|
*lp++ = htonl(TCP_TIMESTAMP(tp));
|
|
|
|
*lp = htonl(tp->ts_recent);
|
|
|
|
optp += TCPOLEN_TIMESTAMP - 2;
|
|
|
|
optlen += TCPOLEN_TIMESTAMP;
|
|
|
|
|
|
|
|
/* Set receive buffer autosizing timestamp. */
|
|
|
|
if (tp->rfbuf_ts == 0 &&
|
|
|
|
(so->so_rcv.sb_flags & SB_AUTOSIZE))
|
|
|
|
tp->rfbuf_ts = TCP_TIMESTAMP(tp);
|
|
|
|
} else {
|
|
|
|
optp -= alen;
|
|
|
|
optlen -= alen;
|
|
|
|
}
|
2002-06-09 20:33:36 +04:00
|
|
|
}
|
|
|
|
|
2017-01-02 04:18:42 +03:00
|
|
|
#ifdef TCP_SIGNATURE
|
|
|
|
if (tp->t_flags & TF_SIGNATURE) {
|
|
|
|
/*
|
|
|
|
* Initialize TCP-MD5 option (RFC2385)
|
|
|
|
*/
|
2017-01-02 05:38:54 +03:00
|
|
|
if (!OPT_FITS(TCPOLEN_SIGNATURE))
|
|
|
|
goto reset;
|
|
|
|
|
|
|
|
*optp++ = TCPOPT_SIGNATURE;
|
|
|
|
*optp++ = TCPOLEN_SIGNATURE;
|
|
|
|
sigoff = optlen + 2;
|
|
|
|
memset(optp, 0, TCP_SIGLEN);
|
|
|
|
optlen += TCPOLEN_SIGNATURE;
|
|
|
|
optp += TCP_SIGLEN;
|
2017-01-02 04:18:42 +03:00
|
|
|
}
|
|
|
|
#endif /* TCP_SIGNATURE */
|
|
|
|
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
/*
|
|
|
|
* Tack on the SACK block if it is necessary.
|
|
|
|
*/
|
2005-03-16 03:39:56 +03:00
|
|
|
if (sack_numblks) {
|
2017-01-02 04:18:42 +03:00
|
|
|
int alen = 0;
|
|
|
|
int sack_len = sack_numblks * 8;
|
2017-01-04 15:35:14 +03:00
|
|
|
while (optlen % 4 != 2) {
|
2017-01-02 04:18:42 +03:00
|
|
|
optlen += TCPOLEN_NOP;
|
|
|
|
*optp++ = TCPOPT_NOP;
|
|
|
|
alen++;
|
|
|
|
}
|
2014-10-25 19:07:13 +04:00
|
|
|
if (OPT_FITS(sack_len + 2)) {
|
2017-01-02 04:18:42 +03:00
|
|
|
struct ipqent *tiqe;
|
|
|
|
*optp++ = TCPOPT_SACK;
|
|
|
|
*optp++ = sack_len + 2;
|
|
|
|
uint32_t *lp = (uint32_t *)optp;
|
2014-10-25 19:07:13 +04:00
|
|
|
if ((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) {
|
|
|
|
sack_numblks--;
|
|
|
|
*lp++ = htonl(tp->rcv_dsack_block.left);
|
|
|
|
*lp++ = htonl(tp->rcv_dsack_block.right);
|
|
|
|
tp->rcv_sack_flags &= ~TCPSACK_HAVED;
|
|
|
|
}
|
|
|
|
for (tiqe = TAILQ_FIRST(&tp->timeq);
|
|
|
|
sack_numblks > 0;
|
|
|
|
tiqe = TAILQ_NEXT(tiqe, ipqe_timeq)) {
|
|
|
|
KASSERT(tiqe != NULL);
|
|
|
|
sack_numblks--;
|
|
|
|
*lp++ = htonl(tiqe->ipqe_seq);
|
|
|
|
*lp++ = htonl(tiqe->ipqe_seq + tiqe->ipqe_len +
|
|
|
|
((tiqe->ipqe_flags & TH_FIN) != 0 ? 1 : 0));
|
|
|
|
}
|
2017-01-04 18:09:37 +03:00
|
|
|
optlen += sack_len + 2;
|
2017-01-02 04:18:42 +03:00
|
|
|
optp += sack_len;
|
|
|
|
} else {
|
|
|
|
optp -= alen;
|
|
|
|
optlen -= alen;
|
2005-03-16 03:39:56 +03:00
|
|
|
}
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
}
|
|
|
|
|
2017-01-02 04:18:42 +03:00
|
|
|
/* Terminate and pad TCP options to a 4 byte boundary. */
|
|
|
|
if (optlen % 4) {
|
2017-01-03 23:59:32 +03:00
|
|
|
if (!OPT_FITS(TCPOLEN_EOL)) {
|
2017-01-03 16:09:33 +03:00
|
|
|
reset: TCP_REASS_UNLOCK(tp);
|
2017-01-02 05:38:54 +03:00
|
|
|
error = ECONNABORTED;
|
|
|
|
goto out;
|
|
|
|
}
|
2017-01-02 04:18:42 +03:00
|
|
|
optlen += TCPOLEN_EOL;
|
|
|
|
*optp++ = TCPOPT_EOL;
|
2015-05-16 04:15:34 +03:00
|
|
|
}
|
2017-01-02 04:18:42 +03:00
|
|
|
/*
|
|
|
|
* According to RFC 793 (STD0007):
|
|
|
|
* "The content of the header beyond the End-of-Option option
|
|
|
|
* must be header padding (i.e., zero)."
|
|
|
|
* and later: "The padding is composed of zeros."
|
|
|
|
*/
|
|
|
|
while (optlen % 4) {
|
2017-01-03 23:59:32 +03:00
|
|
|
if (!OPT_FITS(TCPOLEN_PAD))
|
2017-01-02 04:18:42 +03:00
|
|
|
goto reset;
|
|
|
|
optlen += TCPOLEN_PAD;
|
|
|
|
*optp++ = TCPOPT_PAD;
|
|
|
|
}
|
|
|
|
|
|
|
|
TCP_REASS_UNLOCK(tp);
|
Initial commit of a port of the FreeBSD implementation of RFC 2385
(MD5 signatures for TCP, as used with BGP). Credit for original
FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship
credited to sentex.net. Shortening of the setsockopt() name
attributed to Vincent Jardin.
This commit is a minimal, working version of the FreeBSD code, as
MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp
modified to set the TCP-MD5 option; BMS's additions to tcpdump-current
(tcpdump -M) confirm that the MD5 signatures are correct. Committed
as-is for further testing between a NetBSD BGP speaker (e.g., quagga)
and industry-standard BGP speakers (e.g., Cisco, Juniper).
NOTE: This version has two potential flaws. First, I do not see any code
that verifies received TCP-MD5 signatures. Second, the TCP-MD5
options are internally padded and assumed to be 32-bit aligned. A more
space-efficient scheme is to pack all TCP options densely (and
possibly unaligned) into the TCP header ; then do one final padding to
a 4-byte boundary. Pre-existing comments note that accounting for
TCP-option space when we add SACK is yet to be done. For now, I'm
punting on that; we can solve it properly, in a way that will handle
SACK blocks, as a separate exercise.
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c
,and modifies:
sys/net/pfkeyv2.h,v 1.15
sys/netinet/files.netinet,v 1.5
sys/netinet/ip.h,v 1.25
sys/netinet/tcp.h,v 1.15
sys/netinet/tcp_input.c,v 1.200
sys/netinet/tcp_output.c,v 1.109
sys/netinet/tcp_subr.c,v 1.165
sys/netinet/tcp_usrreq.c,v 1.89
sys/netinet/tcp_var.h,v 1.109
sys/netipsec/files.netipsec,v 1.3
sys/netipsec/ipsec.c,v 1.11
sys/netipsec/ipsec.h,v 1.7
sys/netipsec/key.c,v 1.11
share/man/man4/tcp.4,v 1.16
lib/libipsec/pfkey.c,v 1.20
lib/libipsec/pfkey_dump.c,v 1.17
lib/libipsec/policy_token.l,v 1.8
sbin/setkey/parse.y,v 1.14
sbin/setkey/setkey.8,v 1.27
sbin/setkey/token.l,v 1.15
Note that the preceding two revisions to tcp.4 will be
required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
|
|
|
|
2002-06-09 20:33:36 +04:00
|
|
|
hdrlen += optlen;
|
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
#ifdef DIAGNOSTIC
|
2005-03-06 03:35:07 +03:00
|
|
|
if (!use_tso && len > txsegsize)
|
1998-03-18 02:50:30 +03:00
|
|
|
panic("tcp data to be sent is larger than segment");
|
2005-03-06 03:35:07 +03:00
|
|
|
else if (use_tso && len > IP_MAXPACKET)
|
|
|
|
panic("tcp data to be sent is larger than max TSO size");
|
2002-06-09 20:33:36 +04:00
|
|
|
if (max_linkhdr + hdrlen > MCLBYTES)
|
1994-05-13 10:02:48 +04:00
|
|
|
panic("tcphdr too big");
|
1993-03-21 12:45:37 +03:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Grab a header mbuf, attaching a copy of data to
|
|
|
|
* be transmitted, and initialize the header from
|
|
|
|
* the template for sends on this connection.
|
|
|
|
*/
|
|
|
|
if (len) {
|
2001-07-31 06:25:22 +04:00
|
|
|
error = tcp_build_datapkt(tp, so, off, len, hdrlen, &m);
|
|
|
|
if (error)
|
1993-03-21 12:45:37 +03:00
|
|
|
goto out;
|
|
|
|
/*
|
|
|
|
* If we're sending everything we've got, set PUSH.
|
|
|
|
* (This will keep happy those implementations which only
|
|
|
|
* give data to the user when a buffer fills or
|
|
|
|
* a PUSH comes in.)
|
|
|
|
*/
|
|
|
|
if (off + len == so->so_snd.sb_cc)
|
|
|
|
flags |= TH_PUSH;
|
|
|
|
} else {
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps = TCP_STAT_GETREF();
|
1993-03-21 12:45:37 +03:00
|
|
|
if (tp->t_flags & TF_ACKNOW)
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDACKS]++;
|
1993-03-21 12:45:37 +03:00
|
|
|
else if (flags & (TH_SYN|TH_FIN|TH_RST))
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDCTRL]++;
|
1993-03-21 12:45:37 +03:00
|
|
|
else if (SEQ_GT(tp->snd_up, tp->snd_una))
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDURG]++;
|
1993-03-21 12:45:37 +03:00
|
|
|
else
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_SNDWINUP]++;
|
|
|
|
TCP_STAT_PUTREF();
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
MGETHDR(m, M_DONTWAIT, MT_HEADER);
|
2000-02-09 03:50:40 +03:00
|
|
|
if (m != NULL && max_linkhdr + hdrlen > MHLEN) {
|
1999-07-01 12:12:45 +04:00
|
|
|
MCLGET(m, M_DONTWAIT);
|
|
|
|
if ((m->m_flags & M_EXT) == 0) {
|
|
|
|
m_freem(m);
|
|
|
|
m = NULL;
|
|
|
|
}
|
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
if (m == NULL) {
|
|
|
|
error = ENOBUFS;
|
|
|
|
goto out;
|
|
|
|
}
|
2003-02-26 09:31:08 +03:00
|
|
|
MCLAIM(m, &tcp_tx_mowner);
|
1993-03-21 12:45:37 +03:00
|
|
|
m->m_data += max_linkhdr;
|
|
|
|
m->m_len = hdrlen;
|
|
|
|
}
|
2016-06-10 16:27:10 +03:00
|
|
|
m_reset_rcvif(m);
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (af) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
|
|
|
ip = mtod(m, struct ip *);
|
|
|
|
#ifdef INET6
|
|
|
|
ip6 = NULL;
|
|
|
|
#endif
|
|
|
|
th = (struct tcphdr *)(ip + 1);
|
|
|
|
break;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
|
|
|
ip = NULL;
|
|
|
|
ip6 = mtod(m, struct ip6_hdr *);
|
|
|
|
th = (struct tcphdr *)(ip6 + 1);
|
|
|
|
break;
|
|
|
|
#endif
|
1999-07-02 16:45:32 +04:00
|
|
|
default: /*pacify gcc*/
|
|
|
|
ip = NULL;
|
1999-07-03 01:02:05 +04:00
|
|
|
#ifdef INET6
|
1999-07-02 16:45:32 +04:00
|
|
|
ip6 = NULL;
|
1999-07-03 01:02:05 +04:00
|
|
|
#endif
|
1999-07-02 16:45:32 +04:00
|
|
|
th = NULL;
|
|
|
|
break;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
if (tp->t_template == 0)
|
|
|
|
panic("tcp_output");
|
1999-07-01 12:12:45 +04:00
|
|
|
if (tp->t_template->m_len < iphdrlen)
|
|
|
|
panic("tcp_output");
|
2007-03-04 08:59:00 +03:00
|
|
|
bcopy(mtod(tp->t_template, void *), mtod(m, void *), iphdrlen);
|
1993-03-21 12:45:37 +03:00
|
|
|
|
2006-09-05 04:29:35 +04:00
|
|
|
/*
|
|
|
|
* If we are starting a connection, send ECN setup
|
|
|
|
* SYN packet. If we are on a retransmit, we may
|
|
|
|
* resend those bits a number of times as per
|
|
|
|
* RFC 3168.
|
|
|
|
*/
|
|
|
|
if (tp->t_state == TCPS_SYN_SENT && tcp_do_ecn) {
|
|
|
|
if (tp->t_flags & TF_SYN_REXMT) {
|
|
|
|
if (tp->t_ecn_retries--)
|
|
|
|
flags |= TH_ECE|TH_CWR;
|
|
|
|
} else {
|
|
|
|
flags |= TH_ECE|TH_CWR;
|
|
|
|
tp->t_ecn_retries = tcp_ecn_maxretries;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (TCP_ECN_ALLOWED(tp)) {
|
|
|
|
/*
|
|
|
|
* If the peer has ECN, mark data packets
|
|
|
|
* ECN capable. Ignore pure ack packets, retransmissions
|
|
|
|
* and window probes.
|
|
|
|
*/
|
|
|
|
if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
|
|
|
|
!(tp->t_force && len == 1)) {
|
2011-03-21 23:39:32 +03:00
|
|
|
ecn_tos = IPTOS_ECN_ECT0;
|
2008-04-12 09:58:22 +04:00
|
|
|
TCP_STATINC(TCP_STAT_ECN_ECT);
|
2006-09-05 04:29:35 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reply with proper ECN notifications.
|
|
|
|
*/
|
|
|
|
if (tp->t_flags & TF_ECN_SND_CWR) {
|
|
|
|
flags |= TH_CWR;
|
|
|
|
tp->t_flags &= ~TF_ECN_SND_CWR;
|
|
|
|
}
|
|
|
|
if (tp->t_flags & TF_ECN_SND_ECE) {
|
|
|
|
flags |= TH_ECE;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
1994-05-13 10:02:48 +04:00
|
|
|
/*
|
|
|
|
* If we are doing retransmissions, then snd_nxt will
|
|
|
|
* not reflect the first unsent octet. For ACK only
|
|
|
|
* packets, we do not want the sequence number of the
|
|
|
|
* retransmitted packet, we want the sequence number
|
|
|
|
* of the next unsent octet. So, if there is no data
|
|
|
|
* (and no SYN or FIN), use snd_max instead of snd_nxt
|
|
|
|
* when filling in ti_seq. But if we are in persist
|
|
|
|
* state, snd_max might reflect one byte beyond the
|
|
|
|
* right edge of the window, so use snd_nxt in that
|
|
|
|
* case, since we know we aren't doing a retransmission.
|
|
|
|
* (retransmit and persist are mutually exclusive...)
|
|
|
|
*/
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond,
to SACKs in wide-area traffic.
There are two independently-observed, as-yet-unresolved anomalies:
First, seeing unexplained delays between in fast retransmission
(potentially explainable by an 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link/level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and in my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (TCP_SACK_ENABLED(tp) && sack_rxmit) {
|
|
|
|
th->th_seq = htonl(p->rxmit);
|
|
|
|
p->rxmit += len;
|
|
|
|
} else {
|
|
|
|
if (len || (flags & (TH_SYN|TH_FIN)) ||
|
|
|
|
TCP_TIMER_ISARMED(tp, TCPT_PERSIST))
|
|
|
|
th->th_seq = htonl(tp->snd_nxt);
|
|
|
|
else
|
|
|
|
th->th_seq = htonl(tp->snd_max);
|
|
|
|
}
|
1999-07-01 12:12:45 +04:00
|
|
|
th->th_ack = htonl(tp->rcv_nxt);
|
1993-03-21 12:45:37 +03:00
|
|
|
if (optlen) {
|
2007-03-04 08:59:00 +03:00
|
|
|
bcopy((void *)opt, (void *)(th + 1), optlen);
|
1999-07-01 12:12:45 +04:00
|
|
|
th->th_off = (sizeof (struct tcphdr) + optlen) >> 2;
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
1999-07-01 12:12:45 +04:00
|
|
|
th->th_flags = flags;
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
|
|
|
* Calculate receive window. Don't shrink window,
|
|
|
|
* but avoid silly window syndrome.
|
|
|
|
*/
|
1997-10-08 20:32:48 +04:00
|
|
|
if (win < (long)(so->so_rcv.sb_hiwat / 4) && win < (long)rxsegsize)
|
1993-03-21 12:45:37 +03:00
|
|
|
win = 0;
|
1994-05-13 10:02:48 +04:00
|
|
|
if (win > (long)TCP_MAXWIN << tp->rcv_scale)
|
|
|
|
win = (long)TCP_MAXWIN << tp->rcv_scale;
|
2004-05-08 18:41:47 +04:00
|
|
|
if (win < (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt))
|
|
|
|
win = (long)(int32_t)(tp->rcv_adv - tp->rcv_nxt);
|
1999-07-01 12:12:45 +04:00
|
|
|
th->th_win = htons((u_int16_t) (win>>tp->rcv_scale));
|
2015-02-14 15:57:52 +03:00
|
|
|
if (th->th_win == 0) {
|
|
|
|
tp->t_sndzerowin++;
|
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
if (SEQ_GT(tp->snd_up, tp->snd_nxt)) {
|
1997-06-03 20:17:09 +04:00
|
|
|
u_int32_t urp = tp->snd_up - tp->snd_nxt;
|
|
|
|
if (urp > IP_MAXPACKET)
|
|
|
|
urp = IP_MAXPACKET;
|
1999-07-01 12:12:45 +04:00
|
|
|
th->th_urp = htons((u_int16_t)urp);
|
|
|
|
th->th_flags |= TH_URG;
|
1993-03-21 12:45:37 +03:00
|
|
|
} else
|
|
|
|
/*
|
|
|
|
* If no urgent pointer to send, then we pull
|
|
|
|
* the urgent pointer to the left edge of the send window
|
|
|
|
* so that it doesn't drift into the send window on sequence
|
|
|
|
* number wraparound.
|
|
|
|
*/
|
|
|
|
tp->snd_up = tp->snd_una; /* drag it along */
|
|
|
|
|
Initial commit of a port of the FreeBSD implementation of RFC 2385
(MD5 signatures for TCP, as used with BGP). Credit for original
FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship
credited to sentex.net. Shortening of the setsockopt() name
attributed to Vincent Jardin.
This commit is a minimal, working version of the FreeBSD code, as
MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp
modified to set the TCP-MD5 option; BMS's additions to tcpdump-current
(tcpdump -M) confirm that the MD5 signatures are correct. Committed
as-is for further testing between a NetBSD BGP speaker (e.g., quagga)
and industry-standard BGP speakers (e.g., Cisco, Juniper).
NOTE: This version has two potential flaws. First, I do not see any code
that verifies received TCP-MD5 signatures. Second, the TCP-MD5
options are internally padded and assumed to be 32-bit aligned. A more
space-efficient scheme is to pack all TCP options densely (and
possibly unaligned) into the TCP header ; then do one final padding to
a 4-byte boundary. Pre-existing comments note that accounting for
TCP-option space when we add SACK is yet to be done. For now, I'm
punting on that; we can solve it properly, in a way that will handle
SACK blocks, as a separate exercise.
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c
,and modifies:
sys/net/pfkeyv2.h,v 1.15
sys/netinet/files.netinet,v 1.5
sys/netinet/ip.h,v 1.25
sys/netinet/tcp.h,v 1.15
sys/netinet/tcp_input.c,v 1.200
sys/netinet/tcp_output.c,v 1.109
sys/netinet/tcp_subr.c,v 1.165
sys/netinet/tcp_usrreq.c,v 1.89
sys/netinet/tcp_var.h,v 1.109
sys/netipsec/files.netipsec,v 1.3
sys/netipsec/ipsec.c,v 1.11
sys/netipsec/ipsec.h,v 1.7
sys/netipsec/key.c,v 1.11
share/man/man4/tcp.4,v 1.16
lib/libipsec/pfkey.c,v 1.20
lib/libipsec/pfkey_dump.c,v 1.17
lib/libipsec/policy_token.l,v 1.8
sbin/setkey/parse.y,v 1.14
sbin/setkey/setkey.8,v 1.27
sbin/setkey/token.l,v 1.15
Note that the preceding two revisions to tcp.4 will be
required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
|
|
|
#ifdef TCP_SIGNATURE
|
2004-05-18 18:44:14 +04:00
|
|
|
if (sigoff && (tp->t_flags & TF_SIGNATURE)) {
|
|
|
|
struct secasvar *sav;
|
|
|
|
u_int8_t *sigp;
|
|
|
|
|
|
|
|
sav = tcp_signature_getsav(m, th);
|
2005-02-27 01:45:09 +03:00
|
|
|
|
2004-05-18 18:44:14 +04:00
|
|
|
if (sav == NULL) {
|
|
|
|
if (m)
|
|
|
|
m_freem(m);
|
|
|
|
return (EPERM);
|
|
|
|
}
|
|
|
|
|
|
|
|
m->m_pkthdr.len = hdrlen + len;
|
2007-05-19 01:48:43 +04:00
|
|
|
sigp = (char *)th + sizeof(*th) + sigoff;
|
|
|
|
tcp_signature(m, th, (char *)th - mtod(m, char *), sav, sigp);
|
2004-05-18 18:44:14 +04:00
|
|
|
|
|
|
|
key_sa_recordxfer(sav, m);
|
|
|
|
KEY_FREESAV(&sav);
|
|
|
|
}
|
Initial commit of a port of the FreeBSD implementation of RFC 2385
(MD5 signatures for TCP, as used with BGP). Credit for original
FreeBSD code goes to Bruce M. Simpson, with FreeBSD sponsorship
credited to sentex.net. Shortening of the setsockopt() name
attributed to Vincent Jardin.
This commit is a minimal, working version of the FreeBSD code, as
MFC'ed to FreeBSD-4. It has received minimal testing with a ttcp
modified to set the TCP-MD5 option; BMS's additions to tcpdump-current
(tcpdump -M) confirm that the MD5 signatures are correct. Committed
as-is for further testing between a NetBSD BGP speaker (e.g., quagga)
and industry-standard BGP speakers (e.g., Cisco, Juniper).
NOTE: This version has two potential flaws. First, I do not see any code
that verifies received TCP-MD5 signatures. Second, the TCP-MD5
options are internally padded and assumed to be 32-bit aligned. A more
space-efficient scheme is to pack all TCP options densely (and
possibly unaligned) into the TCP header; then do one final padding to
a 4-byte boundary. Pre-existing comments note that accounting for
TCP-option space when we add SACK is yet to be done. For now, I'm
punting on that; we can solve it properly, in a way that will handle
SACK blocks, as a separate exercise.
In case a pullup to NetBSD-2 is requested, this adds sys/netipsec/xform_tcp.c
,and modifies:
sys/net/pfkeyv2.h,v 1.15
sys/netinet/files.netinet,v 1.5
sys/netinet/ip.h,v 1.25
sys/netinet/tcp.h,v 1.15
sys/netinet/tcp_input.c,v 1.200
sys/netinet/tcp_output.c,v 1.109
sys/netinet/tcp_subr.c,v 1.165
sys/netinet/tcp_usrreq.c,v 1.89
sys/netinet/tcp_var.h,v 1.109
sys/netipsec/files.netipsec,v 1.3
sys/netipsec/ipsec.c,v 1.11
sys/netipsec/ipsec.h,v 1.7
sys/netipsec/key.c,v 1.11
share/man/man4/tcp.4,v 1.16
lib/libipsec/pfkey.c,v 1.20
lib/libipsec/pfkey_dump.c,v 1.17
lib/libipsec/policy_token.l,v 1.8
sbin/setkey/parse.y,v 1.14
sbin/setkey/setkey.8,v 1.27
sbin/setkey/token.l,v 1.15
Note that the preceding two revisions to tcp.4 will be
required to cleanly apply this diff.
2004-04-26 02:25:03 +04:00
|
|
|
#endif
|
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
2001-06-02 20:17:09 +04:00
|
|
|
* Set ourselves up to be checksummed just before the packet
|
2005-04-19 01:50:25 +04:00
|
|
|
* hits the wire.
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (af) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
2005-03-09 06:38:33 +03:00
|
|
|
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
|
2005-03-06 03:35:07 +03:00
|
|
|
if (use_tso) {
|
|
|
|
m->m_pkthdr.segsz = txsegsize;
|
2005-03-09 06:38:33 +03:00
|
|
|
m->m_pkthdr.csum_flags = M_CSUM_TSOv4;
|
2005-03-06 03:35:07 +03:00
|
|
|
} else {
|
2005-04-19 01:50:25 +04:00
|
|
|
m->m_pkthdr.csum_flags = M_CSUM_TCPv4;
|
2005-03-06 03:35:07 +03:00
|
|
|
if (len + optlen) {
|
|
|
|
/* Fixup the pseudo-header checksum. */
|
|
|
|
/* XXXJRT Not IP Jumbogram safe. */
|
|
|
|
th->th_sum = in_cksum_addword(th->th_sum,
|
|
|
|
htons((u_int16_t) (len + optlen)));
|
|
|
|
}
|
2001-06-02 20:17:09 +04:00
|
|
|
}
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
2001-06-02 20:17:09 +04:00
|
|
|
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
|
2006-11-23 22:41:58 +03:00
|
|
|
if (use_tso) {
|
|
|
|
m->m_pkthdr.segsz = txsegsize;
|
|
|
|
m->m_pkthdr.csum_flags = M_CSUM_TSOv6;
|
|
|
|
} else {
|
|
|
|
m->m_pkthdr.csum_flags = M_CSUM_TCPv6;
|
|
|
|
if (len + optlen) {
|
|
|
|
/* Fixup the pseudo-header checksum. */
|
|
|
|
/* XXXJRT: Not IPv6 Jumbogram safe. */
|
|
|
|
th->th_sum = in_cksum_addword(th->th_sum,
|
|
|
|
htons((u_int16_t) (len + optlen)));
|
|
|
|
}
|
2001-06-02 20:17:09 +04:00
|
|
|
}
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
|
|
|
#endif
|
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* In transmit state, time the transmission and arrange for
|
|
|
|
* the retransmit. In persist state, just set snd_max.
|
|
|
|
*/
|
1998-05-06 05:21:20 +04:00
|
|
|
if (tp->t_force == 0 || TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
|
1993-03-21 12:45:37 +03:00
|
|
|
tcp_seq startseq = tp->snd_nxt;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance snd_nxt over sequence space of this segment.
|
1998-07-21 14:46:00 +04:00
|
|
|
* There are no states in which we send both a SYN and a FIN,
|
|
|
|
* so we collapse the tests for these flags.
|
1993-03-21 12:45:37 +03:00
|
|
|
*/
|
1998-07-21 14:46:00 +04:00
|
|
|
if (flags & (TH_SYN|TH_FIN))
|
|
|
|
tp->snd_nxt++;
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently observed, as-yet-unresolved anomalies:
first, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
if (sack_rxmit)
|
|
|
|
goto timer;
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->snd_nxt += len;
|
|
|
|
if (SEQ_GT(tp->snd_nxt, tp->snd_max)) {
|
|
|
|
tp->snd_max = tp->snd_nxt;
|
|
|
|
/*
|
|
|
|
* Time this transmission if not a retransmission and
|
|
|
|
* not currently timing anything.
|
|
|
|
*/
|
2001-09-10 19:23:09 +04:00
|
|
|
if (tp->t_rtttime == 0) {
|
|
|
|
tp->t_rtttime = tcp_now;
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->t_rtseq = startseq;
|
2008-04-12 09:58:22 +04:00
|
|
|
TCP_STATINC(TCP_STAT_SEGSTIMED);
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Set retransmit timer if not currently set,
|
|
|
|
* and not doing an ack or a keep-alive probe.
|
|
|
|
* Initial value for retransmit timer is smoothed
|
|
|
|
* round-trip time + 2 * round-trip time variance.
|
|
|
|
* Initialize shift counter which is used for backoff
|
|
|
|
* of retransmit time.
|
|
|
|
*/
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently observed, as-yet-unresolved anomalies:
first, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
timer:
|
2015-07-24 07:33:50 +03:00
|
|
|
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT) == 0) {
|
|
|
|
if ((sack_rxmit && tp->snd_nxt != tp->snd_max)
|
|
|
|
|| tp->snd_nxt != tp->snd_una) {
|
|
|
|
if (TCP_TIMER_ISARMED(tp, TCPT_PERSIST)) {
|
|
|
|
TCP_TIMER_DISARM(tp, TCPT_PERSIST);
|
|
|
|
tp->t_rxtshift = 0;
|
|
|
|
}
|
|
|
|
TCP_TIMER_ARM(tp, TCPT_REXMT, tp->t_rxtcur);
|
|
|
|
} else if (len == 0 && so->so_snd.sb_cc > 0
|
|
|
|
&& TCP_TIMER_ISARMED(tp, TCPT_PERSIST) == 0) {
|
|
|
|
/*
|
|
|
|
* If we are sending a window probe and there's
|
|
|
|
* unacked data in the socket, make sure at
|
|
|
|
* least the persist timer is running.
|
|
|
|
*/
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->t_rxtshift = 0;
|
2015-07-24 07:33:50 +03:00
|
|
|
tcp_setpersist(tp);
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
} else
|
|
|
|
if (SEQ_GT(tp->snd_nxt + len, tp->snd_max))
|
|
|
|
tp->snd_max = tp->snd_nxt + len;
|
|
|
|
|
2001-07-08 20:18:56 +04:00
|
|
|
#ifdef TCP_DEBUG
|
1993-03-21 12:45:37 +03:00
|
|
|
/*
|
|
|
|
* Trace.
|
|
|
|
*/
|
2003-05-17 21:16:20 +04:00
|
|
|
if (so->so_options & SO_DEBUG)
|
1999-07-01 12:12:45 +04:00
|
|
|
tcp_trace(TA_OUTPUT, tp->t_state, tp, m, 0);
|
2001-07-08 20:18:56 +04:00
|
|
|
#endif
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Fill in IP length and desired time to live and
|
|
|
|
* send to IP level. There should be a better way
|
|
|
|
* to handle ttl and tos; we could keep them in
|
|
|
|
* the template, but need a way to checksum without them.
|
|
|
|
*/
|
|
|
|
m->m_pkthdr.len = hdrlen + len;
|
1997-10-18 02:12:14 +04:00
|
|
|
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (af) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
2002-08-14 04:23:27 +04:00
|
|
|
ip->ip_len = htons(m->m_pkthdr.len);
|
2005-07-19 21:00:02 +04:00
|
|
|
packetlen = m->m_pkthdr.len;
|
1999-07-01 12:12:45 +04:00
|
|
|
if (tp->t_inpcb) {
|
|
|
|
ip->ip_ttl = tp->t_inpcb->inp_ip.ip_ttl;
|
2011-03-21 23:39:32 +03:00
|
|
|
ip->ip_tos = tp->t_inpcb->inp_ip.ip_tos | ecn_tos;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
#ifdef INET6
|
|
|
|
else if (tp->t_in6pcb) {
|
2000-11-06 03:50:12 +03:00
|
|
|
ip->ip_ttl = in6_selecthlim(tp->t_in6pcb, NULL); /*XXX*/
|
2011-03-21 23:39:32 +03:00
|
|
|
ip->ip_tos = ecn_tos; /*XXX*/
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
break;
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
2005-07-19 21:00:02 +04:00
|
|
|
packetlen = m->m_pkthdr.len;
|
1999-07-01 12:12:45 +04:00
|
|
|
ip6->ip6_nxt = IPPROTO_TCP;
|
1999-12-13 18:17:17 +03:00
|
|
|
if (tp->t_in6pcb) {
|
|
|
|
/*
|
|
|
|
* we separately set hoplimit for every segment, since
|
|
|
|
* the user might want to change the value via
|
|
|
|
* setsockopt. Also, desired default hop limit might
|
|
|
|
* be changed via Neighbor Discovery.
|
|
|
|
*/
|
2015-04-27 05:59:44 +03:00
|
|
|
ip6->ip6_hlim = in6_selecthlim_rt(tp->t_in6pcb);
|
1999-12-13 18:17:17 +03:00
|
|
|
}
|
2011-03-21 23:39:32 +03:00
|
|
|
ip6->ip6_flow |= htonl(ecn_tos << 20);
|
|
|
|
/* ip6->ip6_flow = ??? (from template) */
|
1999-07-01 12:12:45 +04:00
|
|
|
/* ip6_plen will be filled in ip6_output(). */
|
|
|
|
break;
|
|
|
|
#endif
|
2005-07-19 21:00:02 +04:00
|
|
|
default: /*pacify gcc*/
|
|
|
|
packetlen = 0;
|
|
|
|
break;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
1997-10-18 02:12:14 +04:00
|
|
|
|
1999-07-01 12:12:45 +04:00
|
|
|
switch (af) {
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
case AF_INET:
|
|
|
|
{
|
|
|
|
struct mbuf *opts;
|
|
|
|
|
|
|
|
if (tp->t_inpcb)
|
|
|
|
opts = tp->t_inpcb->inp_options;
|
|
|
|
else
|
|
|
|
opts = NULL;
|
|
|
|
error = ip_output(m, opts, ro,
|
2002-05-26 20:05:43 +04:00
|
|
|
(tp->t_mtudisc ? IP_MTUDISC : 0) |
|
2012-01-01 00:41:58 +04:00
|
|
|
(so->so_options & SO_DONTROUTE), NULL, so);
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
|
|
|
}
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
1999-07-01 12:12:45 +04:00
|
|
|
#ifdef INET6
|
|
|
|
case AF_INET6:
|
|
|
|
{
|
|
|
|
struct ip6_pktopts *opts;
|
|
|
|
|
|
|
|
if (tp->t_in6pcb)
|
|
|
|
opts = tp->t_in6pcb->in6p_outputopts;
|
|
|
|
else
|
|
|
|
opts = NULL;
|
Eliminate address family-specific route caches (struct route, struct
route_in6, struct route_iso), replacing all caches with a struct
route.
The principal benefit of this change is that all of the protocol
families can benefit from route cache-invalidation, which is
necessary for correct routing. Route-cache invalidation fixes an
ancient PR, kern/3508, at long last; it fixes various other PRs,
also.
Discussions with and ideas from Joerg Sonnenberger influenced this
work tremendously. Of course, all design oversights and bugs are
mine.
DETAILS
1 I added to each address family a pool of sockaddrs. I have
introduced routines for allocating, copying, and duplicating,
and freeing sockaddrs:
struct sockaddr *sockaddr_alloc(sa_family_t af, int flags);
struct sockaddr *sockaddr_copy(struct sockaddr *dst,
const struct sockaddr *src);
struct sockaddr *sockaddr_dup(const struct sockaddr *src, int flags);
void sockaddr_free(struct sockaddr *sa);
sockaddr_alloc() returns either a sockaddr from the pool belonging
to the specified family, or NULL if the pool is exhausted. The
returned sockaddr has the right size for that family; sa_family
and sa_len fields are initialized to the family and sockaddr
length---e.g., sa_family = AF_INET and sa_len = sizeof(struct
sockaddr_in). sockaddr_free() puts the given sockaddr back into
its family's pool.
sockaddr_dup() and sockaddr_copy() work analogously to strdup()
and strcpy(), respectively. sockaddr_copy() KASSERTs that the
family of the destination and source sockaddrs are alike.
The 'flags' argument for sockaddr_alloc() and sockaddr_dup() is
passed directly to pool_get(9).
2 I added routines for initializing sockaddrs in each address
family, sockaddr_in_init(), sockaddr_in6_init(), sockaddr_iso_init(),
etc. They are fairly self-explanatory.
3 structs route_in6 and route_iso are no more. All protocol families
use struct route. I have changed the route cache, 'struct route',
so that it does not contain storage space for a sockaddr. Instead,
struct route points to a sockaddr coming from the pool the sockaddr
belongs to. I added a new method to struct route, rtcache_setdst(),
for setting the cache destination:
int rtcache_setdst(struct route *, const struct sockaddr *);
rtcache_setdst() returns 0 on success, or ENOMEM if no memory is
available to create the sockaddr storage.
It is now possible for rtcache_getdst() to return NULL if, say,
rtcache_setdst() failed. I check the return value for NULL
everywhere in the kernel.
4 Each routing domain (struct domain) has a list of live route
caches, dom_rtcache. rtflushall(sa_family_t af) looks up the
domain indicated by 'af', walks the domain's list of route caches
and invalidates each one.
2007-05-03 00:40:22 +04:00
|
|
|
error = ip6_output(m, opts, ro, so->so_options & SO_DONTROUTE,
|
|
|
|
NULL, so, NULL);
|
1999-07-01 12:12:45 +04:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
#endif
|
1999-07-02 16:45:32 +04:00
|
|
|
default:
|
|
|
|
error = EAFNOSUPPORT;
|
|
|
|
break;
|
1999-07-01 12:12:45 +04:00
|
|
|
}
|
1993-03-21 12:45:37 +03:00
|
|
|
if (error) {
|
|
|
|
out:
|
|
|
|
if (error == ENOBUFS) {
|
2008-04-12 09:58:22 +04:00
|
|
|
TCP_STATINC(TCP_STAT_SELFQUENCH);
|
2000-10-17 07:06:42 +04:00
|
|
|
#ifdef INET
|
1999-07-01 12:12:45 +04:00
|
|
|
if (tp->t_inpcb)
|
|
|
|
tcp_quench(tp->t_inpcb, 0);
|
2000-10-17 07:06:42 +04:00
|
|
|
#endif
|
2000-07-28 06:39:45 +04:00
|
|
|
#ifdef INET6
|
2000-10-17 07:06:42 +04:00
|
|
|
if (tp->t_in6pcb)
|
1999-07-01 12:12:45 +04:00
|
|
|
tcp6_quench(tp->t_in6pcb, 0);
|
|
|
|
#endif
|
2001-09-10 08:24:24 +04:00
|
|
|
error = 0;
|
|
|
|
} else if ((error == EHOSTUNREACH || error == ENETDOWN) &&
|
|
|
|
TCPS_HAVERCVDSYN(tp->t_state)) {
|
1993-03-21 12:45:37 +03:00
|
|
|
tp->t_softerror = error;
|
2001-09-10 08:24:24 +04:00
|
|
|
error = 0;
|
1993-03-21 12:45:37 +03:00
|
|
|
}
|
2001-09-10 08:24:24 +04:00
|
|
|
|
Commit TCP SACK patches from Kentaro A. Kurahone's patch at:
http://www.sigusr1.org/~kurahone/tcp-sack-netbsd-02152005.diff.gz
Fixes in that patch for pre-existing TCP pcb initializations were already
committed to NetBSD-current, so are not included in this commit.
The SACK patch has been observed to correctly negotiate and respond
to SACKs in wide-area traffic.
There are two independently observed, as-yet-unresolved anomalies:
first, unexplained delays in fast retransmission
(potentially explainable by a 0.2sec RTT between adjacent
ethernet/wifi NICs); and second, peculiar and unexplained TCP
retransmits observed over an ath0 card.
After discussion with several interested developers, I'm committing
this now, as-is, for more eyes to use and look over. Current hypothesis
is that the anomalies above may in fact be due to link-level (hardware,
driver, HAL, firmware) aberrations in the test setup, affecting both
Kentaro's wired-Ethernet NIC and my two (different) WiFi NICs.
2005-02-28 19:20:59 +03:00
|
|
|
/* Back out the sequence number advance. */
|
|
|
|
if (sack_rxmit)
|
|
|
|
p->rxmit -= len;
|
|
|
|
|
2001-09-10 08:24:24 +04:00
|
|
|
/* Restart the delayed ACK timer, if necessary. */
|
|
|
|
if (tp->t_flags & TF_DELACK)
|
|
|
|
TCP_RESTART_DELACK(tp);
|
|
|
|
|
1993-03-21 12:45:37 +03:00
|
|
|
return (error);
|
|
|
|
}
|
2005-07-19 21:00:02 +04:00
|
|
|
|
|
|
|
if (packetlen > tp->t_pmtud_mtu_sent)
|
|
|
|
tp->t_pmtud_mtu_sent = packetlen;
|
|
|
|
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps = TCP_STAT_GETREF();
|
|
|
|
tcps[TCP_STAT_SNDTOTAL]++;
|
1997-12-11 09:37:48 +03:00
|
|
|
if (tp->t_flags & TF_DELACK)
|
2008-04-12 09:58:22 +04:00
|
|
|
tcps[TCP_STAT_DELACK]++;
|
|
|
|
TCP_STAT_PUTREF();
|
1993-03-21 12:45:37 +03:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Data sent (as far as we can tell).
|
|
|
|
* If this advertises a larger window than any other segment,
|
|
|
|
* then remember the size of the advertised window.
|
|
|
|
* Any pending ACK has now been sent.
|
|
|
|
*/
|
|
|
|
if (win > 0 && SEQ_GT(tp->rcv_nxt+win, tp->rcv_adv))
|
|
|
|
tp->rcv_adv = tp->rcv_nxt + win;
|
1994-05-13 10:02:48 +04:00
|
|
|
tp->last_ack_sent = tp->rcv_nxt;
|
1997-12-31 06:31:23 +03:00
|
|
|
tp->t_flags &= ~TF_ACKNOW;
|
|
|
|
TCP_CLEAR_DELACK(tp);
|
1998-10-05 01:33:52 +04:00
|
|
|
#ifdef DIAGNOSTIC
|
|
|
|
if (maxburst < 0)
|
|
|
|
printf("tcp_output: maxburst exceeded by %d\n", -maxburst);
|
|
|
|
#endif
|
2006-10-09 20:27:07 +04:00
|
|
|
if (sendalot && (tp->t_congctl == &tcp_reno_ctl || --maxburst))
|
1993-03-21 12:45:37 +03:00
|
|
|
goto again;
|
|
|
|
return (0);
|
|
|
|
}
|
|
|
|
|
1994-01-09 02:07:16 +03:00
|
|
|
void
|
2005-02-04 02:39:32 +03:00
|
|
|
tcp_setpersist(struct tcpcb *tp)
|
1993-03-21 12:45:37 +03:00
|
|
|
{
|
2000-03-30 16:51:13 +04:00
|
|
|
int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> (1 + 2);
|
1998-05-06 05:21:20 +04:00
|
|
|
int nticks;
|
1993-03-21 12:45:37 +03:00
|
|
|
|
1998-05-06 05:21:20 +04:00
|
|
|
if (TCP_TIMER_ISARMED(tp, TCPT_REXMT))
|
1993-03-21 12:45:37 +03:00
|
|
|
panic("tcp_output REXMT");
|
|
|
|
/*
|
|
|
|
* Start/restart persistance timer.
|
|
|
|
*/
|
1998-03-20 01:29:33 +03:00
|
|
|
if (t < tp->t_rttmin)
|
|
|
|
t = tp->t_rttmin;
|
1998-05-06 05:21:20 +04:00
|
|
|
TCPT_RANGESET(nticks, t * tcp_backoff[tp->t_rxtshift],
|
1993-03-21 12:45:37 +03:00
|
|
|
TCPTV_PERSMIN, TCPTV_PERSMAX);
|
1998-05-06 05:21:20 +04:00
|
|
|
TCP_TIMER_ARM(tp, TCPT_PERSIST, nticks);
|
1993-03-21 12:45:37 +03:00
|
|
|
if (tp->t_rxtshift < TCP_MAXRXTSHIFT)
|
|
|
|
tp->t_rxtshift++;
|
|
|
|
}
|