/* $NetBSD: tcp_sack.c,v 1.26 2011/04/14 15:54:31 yamt Exp $ */ /* * Copyright (c) 2005 The NetBSD Foundation, Inc. * All rights reserved. * * This code is derived from software contributed to The NetBSD Foundation * by Kentaro A. Kurahone. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ /* * Copyright (c) 1982, 1986, 1988, 1990, 1993, 1994, 1995 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 4. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * @(#)tcp_sack.c 8.12 (Berkeley) 5/24/95 * $FreeBSD: src/sys/netinet/tcp_sack.c,v 1.3.2.2 2004/12/25 23:02:57 rwatson Exp $ */ /* * @@(#)COPYRIGHT 1.1 (NRL) 17 January 1995 * * NRL grants permission for redistribution and use in source and binary * forms, with or without modification, of the software and documentation * created at NRL provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgements: * This product includes software developed by the University of * California, Berkeley and its contributors. * This product includes software developed at the Information * Technology Division, US Naval Research Laboratory. * 4. Neither the name of the NRL nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * * THE SOFTWARE PROVIDED BY NRL IS PROVIDED BY NRL AND CONTRIBUTORS ``AS * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NRL OR * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * The views and conclusions contained in the software and documentation * are those of the authors and should not be interpreted as representing * official policies, either expressed or implied, of the US Naval * Research Laboratory (NRL). */ #include __KERNEL_RCSID(0, "$NetBSD: tcp_sack.c,v 1.26 2011/04/14 15:54:31 yamt Exp $"); #include "opt_inet.h" #include "opt_ipsec.h" #include "opt_inet_csum.h" #include "opt_tcp_debug.h" #include "opt_ddb.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef INET6 #ifndef INET #include #endif #include #include #include #include #include #include #include #endif #ifndef INET6 /* always need ip6.h for IP6_EXTHDR_GET */ #include #endif #include #include #include #include #include #include #include #include /* SACK block pool. */ static struct pool sackhole_pool; void tcp_sack_init() { pool_init(&sackhole_pool, sizeof(struct sackhole), 0, 0, 0, "sackholepl", NULL, IPL_SOFTNET); } static struct sackhole * sack_allochole(struct tcpcb *tp) { struct sackhole *hole; if (tp->snd_numholes >= tcp_sack_tp_maxholes || tcp_sack_globalholes >= tcp_sack_globalmaxholes) { return NULL; } hole = pool_get(&sackhole_pool, PR_NOWAIT); if (hole == NULL) { return NULL; } tp->snd_numholes++; tcp_sack_globalholes++; return hole; } static struct sackhole * sack_inserthole(struct tcpcb *tp, tcp_seq start, tcp_seq end, struct sackhole *prev) { struct sackhole *hole; hole = sack_allochole(tp); if (hole == NULL) { return NULL; } hole->start = hole->rxmit = start; hole->end = end; if (prev != NULL) { TAILQ_INSERT_AFTER(&tp->snd_holes, prev, hole, sackhole_q); } else { TAILQ_INSERT_TAIL(&tp->snd_holes, hole, sackhole_q); } return hole; } static struct sackhole * sack_removehole(struct tcpcb *tp, struct sackhole *hole) { struct sackhole *next; next = TAILQ_NEXT(hole, sackhole_q); tp->snd_numholes--; tcp_sack_globalholes--; TAILQ_REMOVE(&tp->snd_holes, hole, sackhole_q); pool_put(&sackhole_pool, hole); return next; } /* * tcp_new_dsack: record the reception of a duplicated segment. */ void tcp_new_dsack(struct tcpcb *tp, tcp_seq seq, u_int32_t len) { if (TCP_SACK_ENABLED(tp)) { tp->rcv_dsack_block.left = seq; tp->rcv_dsack_block.right = seq + len; tp->rcv_sack_flags |= TCPSACK_HAVED; } } /* * tcp_sack_option: parse the given SACK option and update the scoreboard. */ void tcp_sack_option(struct tcpcb *tp, const struct tcphdr *th, const u_char *cp, int optlen) { struct sackblk t_sack_block[(MAX_TCPOPTLEN - 2) / (sizeof(u_int32_t) * 2)]; struct sackblk *sack = NULL; struct sackhole *cur = NULL; struct sackhole *tmp = NULL; const char *lp = cp + 2; int i, j, num_sack_blks; tcp_seq left, right, acked; /* * If we aren't processing SACK responses, this is not an ACK * or the peer sends us a sack option with invalid length, don't * update the scoreboard. */ if (!TCP_SACK_ENABLED(tp) || ((th->th_flags & TH_ACK) == 0) || (optlen % 8 != 2 || optlen < 10)) { return; } /* * If we don't want any SACK holes to be allocated, just return. */ if (tcp_sack_globalmaxholes == 0 || tcp_sack_tp_maxholes == 0) { return; } /* If the ACK is outside [snd_una, snd_max], ignore the SACK options. */ if (SEQ_LT(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max)) return; /* * Extract SACK blocks. * * Note that t_sack_block is sorted so that we only need to do * one pass over the sequence number space. (SACK "fast-path") */ num_sack_blks = optlen / 8; acked = (SEQ_GT(th->th_ack, tp->snd_una)) ? th->th_ack : tp->snd_una; for (i = 0; i < num_sack_blks; i++, lp += sizeof(uint32_t) * 2) { memcpy(&left, lp, sizeof(uint32_t)); memcpy(&right, lp + sizeof(uint32_t), sizeof(uint32_t)); left = ntohl(left); right = ntohl(right); if (SEQ_LEQ(right, acked) || SEQ_GT(right, tp->snd_max) || SEQ_GEQ(left, right)) { /* SACK entry that's old, or invalid. */ i--; num_sack_blks--; continue; } /* Insertion sort. */ for (j = i; (j > 0) && SEQ_LT(left, t_sack_block[j - 1].left); j--) { t_sack_block[j].left = t_sack_block[j - 1].left; t_sack_block[j].right = t_sack_block[j - 1].right; } t_sack_block[j].left = left; t_sack_block[j].right = right; } /* Update the scoreboard. */ cur = TAILQ_FIRST(&tp->snd_holes); for (i = 0; i < num_sack_blks; i++) { sack = &t_sack_block[i]; /* * FACK TCP. Update snd_fack so we can enter Fast * Recovery early. */ if (SEQ_GEQ(sack->right, tp->snd_fack)) tp->snd_fack = sack->right; if (TAILQ_EMPTY(&tp->snd_holes)) { /* First hole. */ cur = sack_inserthole(tp, th->th_ack, sack->left, NULL); if (cur == NULL) { /* ENOBUFS, bail out*/ return; } tp->rcv_lastsack = sack->right; continue; /* With next sack block */ } /* Go through the list of holes. */ while (cur) { if (SEQ_LEQ(sack->right, cur->start)) /* SACKs data before the current hole */ break; /* No use going through more holes */ if (SEQ_GEQ(sack->left, cur->end)) { /* SACKs data beyond the current hole */ cur = TAILQ_NEXT(cur, sackhole_q); continue; } if (SEQ_LEQ(sack->left, cur->start)) { /* Data acks at least the beginning of hole */ if (SEQ_GEQ(sack->right, cur->end)) { /* Acks entire hole, so delete hole */ cur = sack_removehole(tp, cur); break; } /* Otherwise, move start of hole forward */ cur->start = sack->right; cur->rxmit = SEQ_MAX(cur->rxmit, cur->start); break; } if (SEQ_GEQ(sack->right, cur->end)) { /* Move end of hole backward. */ cur->end = sack->left; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); cur = TAILQ_NEXT(cur, sackhole_q); break; } if (SEQ_LT(cur->start, sack->left) && SEQ_GT(cur->end, sack->right)) { /* * ACKs some data in middle of a hole; need to * split current hole */ tmp = sack_inserthole(tp, sack->right, cur->end, cur); if (tmp == NULL) { return; } tmp->rxmit = SEQ_MAX(cur->rxmit, tmp->start); cur->end = sack->left; cur->rxmit = SEQ_MIN(cur->rxmit, cur->end); cur = tmp; break; } } /* At this point, we have reached the tail of the list. */ if (SEQ_LT(tp->rcv_lastsack, sack->left)) { /* * Need to append new hole at end. */ cur = sack_inserthole(tp, tp->rcv_lastsack, sack->left, NULL); if (cur == NULL) { return; } } if (SEQ_LT(tp->rcv_lastsack, sack->right)) { tp->rcv_lastsack = sack->right; } } } /* * tcp_del_sackholes: remove holes covered by a cumulative ACK. */ void tcp_del_sackholes(struct tcpcb *tp, const struct tcphdr *th) { /* Max because this could be an older ack that just arrived. */ tcp_seq lastack = SEQ_GT(th->th_ack, tp->snd_una) ? th->th_ack : tp->snd_una; struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); while (cur) { if (SEQ_LEQ(cur->end, lastack)) { cur = sack_removehole(tp, cur); } else if (SEQ_LT(cur->start, lastack)) { cur->start = lastack; if (SEQ_LT(cur->rxmit, cur->start)) cur->rxmit = cur->start; break; } else break; } } /* * tcp_free_sackholes: clear the scoreboard. */ void tcp_free_sackholes(struct tcpcb *tp) { struct sackhole *sack; /* Free up the SACK hole list. */ while ((sack = TAILQ_FIRST(&tp->snd_holes)) != NULL) { sack_removehole(tp, sack); } KASSERT(tp->snd_numholes == 0); } /* * Implements the SACK response to a new ack, checking for partial acks * in fast recovery. */ void tcp_sack_newack(struct tcpcb *tp, const struct tcphdr *th) { if (tp->t_partialacks < 0) { /* * Not in fast recovery. Reset the duplicate ack * counter. */ tp->t_dupacks = 0; } else if (SEQ_LT(th->th_ack, tp->snd_recover)) { /* * Partial ack handling within a sack recovery episode. * Keeping this very simple for now. When a partial ack * is received, force snd_cwnd to a value that will allow * the sender to transmit no more than 2 segments. * If necessary, a fancier scheme can be adopted at a * later point, but for now, the goal is to prevent the * sender from bursting a large amount of data in the midst * of sack recovery. */ int num_segs = 1; int sack_bytes_rxmt = 0; tp->t_partialacks++; TCP_TIMER_DISARM(tp, TCPT_REXMT); tp->t_rtttime = 0; /* * send one or 2 segments based on how much new data was acked */ if (((th->th_ack - tp->snd_una) / tp->t_segsz) > 2) num_segs = 2; (void)tcp_sack_output(tp, &sack_bytes_rxmt); tp->snd_cwnd = sack_bytes_rxmt + (tp->snd_nxt - tp->sack_newdata) + num_segs * tp->t_segsz; tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else { /* * Complete ack, inflate the congestion window to * ssthresh and exit fast recovery. * * Window inflation should have left us with approx. * snd_ssthresh outstanding data. But in case we * would be inclined to send a burst, better to do * it via the slow start mechanism. */ if (SEQ_SUB(tp->snd_max, th->th_ack) < tp->snd_ssthresh) tp->snd_cwnd = SEQ_SUB(tp->snd_max, th->th_ack) + tp->t_segsz; else tp->snd_cwnd = tp->snd_ssthresh; tp->t_partialacks = -1; tp->t_dupacks = 0; if (SEQ_GT(th->th_ack, tp->snd_fack)) tp->snd_fack = th->th_ack; } } /* * Returns pointer to a sackhole if there are any pending retransmissions; * NULL otherwise. */ struct sackhole * tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt) { struct sackhole *cur = NULL; if (!TCP_SACK_ENABLED(tp)) return (NULL); *sack_bytes_rexmt = 0; TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { if (SEQ_LT(cur->rxmit, cur->end)) { if (SEQ_LT(cur->rxmit, tp->snd_una)) { /* old SACK hole */ continue; } *sack_bytes_rexmt += (cur->rxmit - cur->start); break; } *sack_bytes_rexmt += (cur->rxmit - cur->start); } return (cur); } /* * After a timeout, the SACK list may be rebuilt. This SACK information * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ void tcp_sack_adjust(struct tcpcb *tp) { struct sackhole *cur = TAILQ_FIRST(&tp->snd_holes); struct sackhole *n = NULL; if (TAILQ_EMPTY(&tp->snd_holes)) return; /* No holes */ if (SEQ_GEQ(tp->snd_nxt, tp->rcv_lastsack)) return; /* We're already beyond any SACKed blocks */ /* * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and rcv_lastsack */ while ((n = TAILQ_NEXT(cur, sackhole_q)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) return; if (SEQ_GEQ(tp->snd_nxt, n->start)) cur = n; else { tp->snd_nxt = n->start; return; } } if (SEQ_LT(tp->snd_nxt, cur->end)) return; tp->snd_nxt = tp->rcv_lastsack; return; } /* * tcp_sack_numblks: return the number of SACK blocks to send. */ int tcp_sack_numblks(const struct tcpcb *tp) { int numblks; if (!TCP_SACK_ENABLED(tp)) { return 0; } numblks = (((tp->rcv_sack_flags & TCPSACK_HAVED) != 0) ? 1 : 0) + tp->t_segqlen; if (numblks == 0) { return 0; } if (numblks > TCP_SACK_MAX) { numblks = TCP_SACK_MAX; } return numblks; } #if defined(DDB) void sack_dump(const struct tcpcb *); void sack_dump(const struct tcpcb *tp) { const struct sackhole *cur; printf("snd_una=%" PRIu32 ", snd_max=%" PRIu32 "\n", tp->snd_una, tp->snd_max); printf("rcv_lastsack=%" PRIu32 ", snd_fack=%" PRIu32 "\n", tp->rcv_lastsack, tp->snd_fack); printf("numholes=%d\n", tp->snd_numholes); TAILQ_FOREACH(cur, &tp->snd_holes, sackhole_q) { printf("\t%" PRIu32 "-%" PRIu32 ", rxmit=%" PRIu32 "\n", cur->start, cur->end, cur->rxmit); } } #endif /* defined(DDB) */