1824 lines
44 KiB
C
1824 lines
44 KiB
C
/* $NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $ */
|
|
/* $KAME: altq_rmclass.c,v 1.19 2005/04/13 03:44:25 suz Exp $ */
|
|
|
|
/*
|
|
* Copyright (c) 1991-1997 Regents of the University of California.
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions
|
|
* are met:
|
|
* 1. Redistributions of source code must retain the above copyright
|
|
* notice, this list of conditions and the following disclaimer.
|
|
* 2. Redistributions in binary form must reproduce the above copyright
|
|
* notice, this list of conditions and the following disclaimer in the
|
|
* documentation and/or other materials provided with the distribution.
|
|
* 3. All advertising materials mentioning features or use of this software
|
|
* must display the following acknowledgement:
|
|
* This product includes software developed by the Network Research
|
|
* Group at Lawrence Berkeley Laboratory.
|
|
* 4. Neither the name of the University nor of the Laboratory may be used
|
|
* to endorse or promote products derived from this software without
|
|
* specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
|
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
* SUCH DAMAGE.
|
|
*
|
|
* LBL code modified by speer@eng.sun.com, May 1977.
|
|
* For questions and/or comments, please send mail to cbq@ee.lbl.gov
|
|
*/
|
|
|
|
#include <sys/cdefs.h>
|
|
__KERNEL_RCSID(0, "$NetBSD: altq_rmclass.c,v 1.21 2008/07/15 16:18:08 christos Exp $");
|
|
|
|
/* #ident "@(#)rm_class.c 1.48 97/12/05 SMI" */
|
|
|
|
#ifdef _KERNEL_OPT
|
|
#include "opt_altq.h"
|
|
#include "opt_inet.h"
|
|
#endif
|
|
|
|
#ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */
|
|
|
|
#include <sys/param.h>
|
|
#include <sys/malloc.h>
|
|
#include <sys/mbuf.h>
|
|
#include <sys/socket.h>
|
|
#include <sys/systm.h>
|
|
#include <sys/errno.h>
|
|
#include <sys/time.h>
|
|
#ifdef ALTQ3_COMPAT
|
|
#include <sys/kernel.h>
|
|
#endif
|
|
|
|
#include <net/if.h>
|
|
#ifdef ALTQ3_COMPAT
|
|
#include <netinet/in.h>
|
|
#include <netinet/in_systm.h>
|
|
#include <netinet/ip.h>
|
|
#endif
|
|
|
|
#include <altq/altq.h>
|
|
#include <altq/altq_rmclass.h>
|
|
#include <altq/altq_rmclass_debug.h>
|
|
#include <altq/altq_red.h>
|
|
#include <altq/altq_rio.h>
|
|
|
|
/*
|
|
* Local Macros
|
|
*/
|
|
|
|
#define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; }
|
|
|
|
/*
|
|
* Local routines.
|
|
*/
|
|
|
|
static int rmc_satisfied(struct rm_class *, struct timeval *);
|
|
static void rmc_wrr_set_weights(struct rm_ifdat *);
|
|
static void rmc_depth_compute(struct rm_class *);
|
|
static void rmc_depth_recompute(rm_class_t *);
|
|
|
|
static mbuf_t *_rmc_wrr_dequeue_next(struct rm_ifdat *, int);
|
|
static mbuf_t *_rmc_prr_dequeue_next(struct rm_ifdat *, int);
|
|
|
|
static int _rmc_addq(rm_class_t *, mbuf_t *);
|
|
static void _rmc_dropq(rm_class_t *);
|
|
static mbuf_t *_rmc_getq(rm_class_t *);
|
|
static mbuf_t *_rmc_pollq(rm_class_t *);
|
|
|
|
static int rmc_under_limit(struct rm_class *, struct timeval *);
|
|
static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *);
|
|
static void rmc_drop_action(struct rm_class *);
|
|
static void rmc_restart(struct rm_class *);
|
|
static void rmc_root_overlimit(struct rm_class *, struct rm_class *);
|
|
|
|
#define BORROW_OFFTIME
|
|
/*
|
|
* BORROW_OFFTIME (experimental):
|
|
* borrow the offtime of the class borrowing from.
|
|
* the reason is that when its own offtime is set, the class is unable
|
|
* to borrow much, especially when cutoff is taking effect.
|
|
* but when the borrowed class is overloaded (advidle is close to minidle),
|
|
* use the borrowing class's offtime to avoid overload.
|
|
*/
|
|
#define ADJUST_CUTOFF
|
|
/*
|
|
* ADJUST_CUTOFF (experimental):
|
|
* if no underlimit class is found due to cutoff, increase cutoff and
|
|
* retry the scheduling loop.
|
|
* also, don't invoke delay_actions while cutoff is taking effect,
|
|
* since a sleeping class won't have a chance to be scheduled in the
|
|
* next loop.
|
|
*
|
|
* now heuristics for setting the top-level variable (cutoff_) becomes:
|
|
* 1. if a packet arrives for a not-overlimit class, set cutoff
|
|
* to the depth of the class.
|
|
* 2. if cutoff is i, and a packet arrives for an overlimit class
|
|
* with an underlimit ancestor at a lower level than i (say j),
|
|
* then set cutoff to j.
|
|
* 3. at scheduling a packet, if there is no underlimit class
|
|
* due to the current cutoff level, increase cutoff by 1 and
|
|
* then try to schedule again.
|
|
*/
|
|
|
|
/*
|
|
* rm_class_t *
|
|
* rmc_newclass(...) - Create a new resource management class at priority
|
|
* 'pri' on the interface given by 'ifd'.
|
|
*
|
|
* nsecPerByte is the data rate of the interface in nanoseconds/byte.
|
|
* E.g., 800 for a 10Mb/s ethernet. If the class gets less
|
|
* than 100% of the bandwidth, this number should be the
|
|
* 'effective' rate for the class. Let f be the
|
|
* bandwidth fraction allocated to this class, and let
|
|
* nsPerByte be the data rate of the output link in
|
|
* nanoseconds/byte. Then nsecPerByte is set to
|
|
* nsPerByte / f. E.g., 1600 (= 800 / .5)
|
|
* for a class that gets 50% of an ethernet's bandwidth.
|
|
*
|
|
* action the routine to call when the class is over limit.
|
|
*
|
|
* maxq max allowable queue size for class (in packets).
|
|
*
|
|
* parent parent class pointer.
|
|
*
|
|
* borrow class to borrow from (should be either 'parent' or null).
|
|
*
|
|
* maxidle max value allowed for class 'idle' time estimate (this
|
|
* parameter determines how large an initial burst of packets
|
|
* can be before overlimit action is invoked.
|
|
*
|
|
* offtime how long 'delay' action will delay when class goes over
|
|
* limit (this parameter determines the steady-state burst
|
|
* size when a class is running over its limit).
|
|
*
|
|
* Maxidle and offtime have to be computed from the following: If the
|
|
* average packet size is s, the bandwidth fraction allocated to this
|
|
* class is f, we want to allow b packet bursts, and the gain of the
|
|
* averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then:
|
|
*
|
|
* ptime = s * nsPerByte * (1 - f) / f
|
|
* maxidle = ptime * (1 - g^b) / g^b
|
|
* minidle = -ptime * (1 / (f - 1))
|
|
* offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1)
|
|
*
|
|
* Operationally, it's convenient to specify maxidle & offtime in units
|
|
* independent of the link bandwidth so the maxidle & offtime passed to
|
|
* this routine are the above values multiplied by 8*f/(1000*nsPerByte).
|
|
* (The constant factor is a scale factor needed to make the parameters
|
|
* integers. This scaling also means that the 'unscaled' values of
|
|
* maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds,
|
|
* not nanoseconds.) Also note that the 'idle' filter computation keeps
|
|
* an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of
|
|
* maxidle also must be scaled upward by this value. Thus, the passed
|
|
* values for maxidle and offtime can be computed as follows:
|
|
*
|
|
* maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte)
|
|
* offtime = offtime * 8 / (1000 * nsecPerByte)
|
|
*
|
|
* When USE_HRTIME is employed, then maxidle and offtime become:
|
|
* maxidle = maxilde * (8.0 / nsecPerByte);
|
|
* offtime = offtime * (8.0 / nsecPerByte);
|
|
*/
|
|
struct rm_class *
|
|
rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte,
|
|
void (*action)(rm_class_t *, rm_class_t *), int maxq,
|
|
struct rm_class *parent, struct rm_class *borrow, u_int maxidle,
|
|
int minidle, u_int offtime, int pktsize, int flags)
|
|
{
|
|
struct rm_class *cl;
|
|
struct rm_class *peer;
|
|
int s;
|
|
|
|
if (pri >= RM_MAXPRIO)
|
|
return (NULL);
|
|
#ifndef ALTQ_RED
|
|
if (flags & RMCF_RED) {
|
|
#ifdef ALTQ_DEBUG
|
|
printf("rmc_newclass: RED not configured for CBQ!\n");
|
|
#endif
|
|
return (NULL);
|
|
}
|
|
#endif
|
|
#ifndef ALTQ_RIO
|
|
if (flags & RMCF_RIO) {
|
|
#ifdef ALTQ_DEBUG
|
|
printf("rmc_newclass: RIO not configured for CBQ!\n");
|
|
#endif
|
|
return (NULL);
|
|
}
|
|
#endif
|
|
|
|
cl = malloc(sizeof(struct rm_class), M_DEVBUF, M_WAITOK|M_ZERO);
|
|
if (cl == NULL)
|
|
return (NULL);
|
|
CALLOUT_INIT(&cl->callout_);
|
|
|
|
cl->q_ = malloc(sizeof(class_queue_t), M_DEVBUF, M_WAITOK|M_ZERO);
|
|
if (cl->q_ == NULL) {
|
|
free(cl, M_DEVBUF);
|
|
return (NULL);
|
|
}
|
|
|
|
/*
|
|
* Class initialization.
|
|
*/
|
|
cl->children_ = NULL;
|
|
cl->parent_ = parent;
|
|
cl->borrow_ = borrow;
|
|
cl->leaf_ = 1;
|
|
cl->ifdat_ = ifd;
|
|
cl->pri_ = pri;
|
|
cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
|
|
cl->depth_ = 0;
|
|
cl->qthresh_ = 0;
|
|
cl->ns_per_byte_ = nsecPerByte;
|
|
|
|
qlimit(cl->q_) = maxq;
|
|
qtype(cl->q_) = Q_DROPHEAD;
|
|
qlen(cl->q_) = 0;
|
|
cl->flags_ = flags;
|
|
|
|
#if 1 /* minidle is also scaled in ALTQ */
|
|
cl->minidle_ = (minidle * (int)nsecPerByte) / 8;
|
|
if (cl->minidle_ > 0)
|
|
cl->minidle_ = 0;
|
|
#else
|
|
cl->minidle_ = minidle;
|
|
#endif
|
|
cl->maxidle_ = (maxidle * nsecPerByte) / 8;
|
|
if (cl->maxidle_ == 0)
|
|
cl->maxidle_ = 1;
|
|
#if 1 /* offtime is also scaled in ALTQ */
|
|
cl->avgidle_ = cl->maxidle_;
|
|
cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
|
|
if (cl->offtime_ == 0)
|
|
cl->offtime_ = 1;
|
|
#else
|
|
cl->avgidle_ = 0;
|
|
cl->offtime_ = (offtime * nsecPerByte) / 8;
|
|
#endif
|
|
cl->overlimit = action;
|
|
|
|
#ifdef ALTQ_RED
|
|
if (flags & (RMCF_RED|RMCF_RIO)) {
|
|
int red_flags, red_pkttime;
|
|
|
|
red_flags = 0;
|
|
if (flags & RMCF_ECN)
|
|
red_flags |= REDF_ECN;
|
|
if (flags & RMCF_FLOWVALVE)
|
|
red_flags |= REDF_FLOWVALVE;
|
|
#ifdef ALTQ_RIO
|
|
if (flags & RMCF_CLEARDSCP)
|
|
red_flags |= RIOF_CLEARDSCP;
|
|
#endif
|
|
red_pkttime = nsecPerByte * pktsize / 1000;
|
|
|
|
if (flags & RMCF_RED) {
|
|
cl->red_ = red_alloc(0, 0,
|
|
qlimit(cl->q_) * 10/100,
|
|
qlimit(cl->q_) * 30/100,
|
|
red_flags, red_pkttime);
|
|
if (cl->red_ != NULL)
|
|
qtype(cl->q_) = Q_RED;
|
|
}
|
|
#ifdef ALTQ_RIO
|
|
else {
|
|
cl->red_ = (red_t *)rio_alloc(0, NULL,
|
|
red_flags, red_pkttime);
|
|
if (cl->red_ != NULL)
|
|
qtype(cl->q_) = Q_RIO;
|
|
}
|
|
#endif
|
|
}
|
|
#endif /* ALTQ_RED */
|
|
|
|
/*
|
|
* put the class into the class tree
|
|
*/
|
|
s = splnet();
|
|
if ((peer = ifd->active_[pri]) != NULL) {
|
|
/* find the last class at this pri */
|
|
cl->peer_ = peer;
|
|
while (peer->peer_ != ifd->active_[pri])
|
|
peer = peer->peer_;
|
|
peer->peer_ = cl;
|
|
} else {
|
|
ifd->active_[pri] = cl;
|
|
cl->peer_ = cl;
|
|
}
|
|
|
|
if (cl->parent_) {
|
|
cl->next_ = parent->children_;
|
|
parent->children_ = cl;
|
|
parent->leaf_ = 0;
|
|
}
|
|
|
|
/*
|
|
* Compute the depth of this class and its ancestors in the class
|
|
* hierarchy.
|
|
*/
|
|
rmc_depth_compute(cl);
|
|
|
|
/*
|
|
* If CBQ's WRR is enabled, then initialize the class WRR state.
|
|
*/
|
|
if (ifd->wrr_) {
|
|
ifd->num_[pri]++;
|
|
ifd->alloc_[pri] += cl->allotment_;
|
|
rmc_wrr_set_weights(ifd);
|
|
}
|
|
splx(s);
|
|
return (cl);
|
|
}
|
|
|
|
int
|
|
rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle,
|
|
int minidle, u_int offtime, int pktsize)
|
|
{
|
|
struct rm_ifdat *ifd;
|
|
u_int old_allotment;
|
|
int s;
|
|
|
|
ifd = cl->ifdat_;
|
|
old_allotment = cl->allotment_;
|
|
|
|
s = splnet();
|
|
cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */
|
|
cl->qthresh_ = 0;
|
|
cl->ns_per_byte_ = nsecPerByte;
|
|
|
|
qlimit(cl->q_) = maxq;
|
|
|
|
#if 1 /* minidle is also scaled in ALTQ */
|
|
cl->minidle_ = (minidle * nsecPerByte) / 8;
|
|
if (cl->minidle_ > 0)
|
|
cl->minidle_ = 0;
|
|
#else
|
|
cl->minidle_ = minidle;
|
|
#endif
|
|
cl->maxidle_ = (maxidle * nsecPerByte) / 8;
|
|
if (cl->maxidle_ == 0)
|
|
cl->maxidle_ = 1;
|
|
#if 1 /* offtime is also scaled in ALTQ */
|
|
cl->avgidle_ = cl->maxidle_;
|
|
cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN;
|
|
if (cl->offtime_ == 0)
|
|
cl->offtime_ = 1;
|
|
#else
|
|
cl->avgidle_ = 0;
|
|
cl->offtime_ = (offtime * nsecPerByte) / 8;
|
|
#endif
|
|
|
|
/*
|
|
* If CBQ's WRR is enabled, then initialize the class WRR state.
|
|
*/
|
|
if (ifd->wrr_) {
|
|
ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment;
|
|
rmc_wrr_set_weights(ifd);
|
|
}
|
|
splx(s);
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* static void
|
|
* rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes
|
|
* the appropriate run robin weights for the CBQ weighted round robin
|
|
* algorithm.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_wrr_set_weights(struct rm_ifdat *ifd)
|
|
{
|
|
int i;
|
|
struct rm_class *cl, *clh;
|
|
|
|
for (i = 0; i < RM_MAXPRIO; i++) {
|
|
/*
|
|
* This is inverted from that of the simulator to
|
|
* maintain precision.
|
|
*/
|
|
if (ifd->num_[i] == 0)
|
|
ifd->M_[i] = 0;
|
|
else
|
|
ifd->M_[i] = ifd->alloc_[i] /
|
|
(ifd->num_[i] * ifd->maxpkt_);
|
|
/*
|
|
* Compute the weighted allotment for each class.
|
|
* This takes the expensive div instruction out
|
|
* of the main loop for the wrr scheduling path.
|
|
* These only get recomputed when a class comes or
|
|
* goes.
|
|
*/
|
|
if (ifd->active_[i] != NULL) {
|
|
clh = cl = ifd->active_[i];
|
|
do {
|
|
/* safe-guard for slow link or alloc_ == 0 */
|
|
if (ifd->M_[i] == 0)
|
|
cl->w_allotment_ = 0;
|
|
else
|
|
cl->w_allotment_ = cl->allotment_ /
|
|
ifd->M_[i];
|
|
cl = cl->peer_;
|
|
} while ((cl != NULL) && (cl != clh));
|
|
}
|
|
}
|
|
}
|
|
|
|
int
|
|
rmc_get_weight(struct rm_ifdat *ifd, int pri)
|
|
{
|
|
if ((pri >= 0) && (pri < RM_MAXPRIO))
|
|
return (ifd->M_[pri]);
|
|
else
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* static void
|
|
* rmc_depth_compute(struct rm_class *cl) - This function computes the
|
|
* appropriate depth of class 'cl' and its ancestors.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_depth_compute(struct rm_class *cl)
|
|
{
|
|
rm_class_t *t = cl, *p;
|
|
|
|
/*
|
|
* Recompute the depth for the branch of the tree.
|
|
*/
|
|
while (t != NULL) {
|
|
p = t->parent_;
|
|
if (p && (t->depth_ >= p->depth_)) {
|
|
p->depth_ = t->depth_ + 1;
|
|
t = p;
|
|
} else
|
|
t = NULL;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* static void
|
|
* rmc_depth_recompute(struct rm_class *cl) - This function re-computes
|
|
* the depth of the tree after a class has been deleted.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_depth_recompute(rm_class_t *cl)
|
|
{
|
|
#if 1 /* ALTQ */
|
|
rm_class_t *p, *t;
|
|
|
|
p = cl;
|
|
while (p != NULL) {
|
|
if ((t = p->children_) == NULL) {
|
|
p->depth_ = 0;
|
|
} else {
|
|
int cdepth = 0;
|
|
|
|
while (t != NULL) {
|
|
if (t->depth_ > cdepth)
|
|
cdepth = t->depth_;
|
|
t = t->next_;
|
|
}
|
|
|
|
if (p->depth_ == cdepth + 1)
|
|
/* no change to this parent */
|
|
return;
|
|
|
|
p->depth_ = cdepth + 1;
|
|
}
|
|
|
|
p = p->parent_;
|
|
}
|
|
#else
|
|
rm_class_t *t;
|
|
|
|
if (cl->depth_ >= 1) {
|
|
if (cl->children_ == NULL) {
|
|
cl->depth_ = 0;
|
|
} else if ((t = cl->children_) != NULL) {
|
|
while (t != NULL) {
|
|
if (t->children_ != NULL)
|
|
rmc_depth_recompute(t);
|
|
t = t->next_;
|
|
}
|
|
} else
|
|
rmc_depth_compute(cl);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This
|
|
* function deletes a class from the link-sharing structure and frees
|
|
* all resources associated with the class.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
void
|
|
rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl)
|
|
{
|
|
struct rm_class *p, *head, *previous;
|
|
int s;
|
|
|
|
ASSERT(cl->children_ == NULL);
|
|
|
|
if (cl->sleeping_)
|
|
CALLOUT_STOP(&cl->callout_);
|
|
|
|
s = splnet();
|
|
/*
|
|
* Free packets in the packet queue.
|
|
* XXX - this may not be a desired behavior. Packets should be
|
|
* re-queued.
|
|
*/
|
|
rmc_dropall(cl);
|
|
|
|
/*
|
|
* If the class has a parent, then remove the class from the
|
|
* class from the parent's children chain.
|
|
*/
|
|
if (cl->parent_ != NULL) {
|
|
head = cl->parent_->children_;
|
|
p = previous = head;
|
|
if (head->next_ == NULL) {
|
|
ASSERT(head == cl);
|
|
cl->parent_->children_ = NULL;
|
|
cl->parent_->leaf_ = 1;
|
|
} else while (p != NULL) {
|
|
if (p == cl) {
|
|
if (cl == head)
|
|
cl->parent_->children_ = cl->next_;
|
|
else
|
|
previous->next_ = cl->next_;
|
|
cl->next_ = NULL;
|
|
p = NULL;
|
|
} else {
|
|
previous = p;
|
|
p = p->next_;
|
|
}
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Delete class from class priority peer list.
|
|
*/
|
|
if ((p = ifd->active_[cl->pri_]) != NULL) {
|
|
/*
|
|
* If there is more than one member of this priority
|
|
* level, then look for class(cl) in the priority level.
|
|
*/
|
|
if (p != p->peer_) {
|
|
while (p->peer_ != cl)
|
|
p = p->peer_;
|
|
p->peer_ = cl->peer_;
|
|
|
|
if (ifd->active_[cl->pri_] == cl)
|
|
ifd->active_[cl->pri_] = cl->peer_;
|
|
} else {
|
|
ASSERT(p == cl);
|
|
ifd->active_[cl->pri_] = NULL;
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Recompute the WRR weights.
|
|
*/
|
|
if (ifd->wrr_) {
|
|
ifd->alloc_[cl->pri_] -= cl->allotment_;
|
|
ifd->num_[cl->pri_]--;
|
|
rmc_wrr_set_weights(ifd);
|
|
}
|
|
|
|
/*
|
|
* Re-compute the depth of the tree.
|
|
*/
|
|
#if 1 /* ALTQ */
|
|
rmc_depth_recompute(cl->parent_);
|
|
#else
|
|
rmc_depth_recompute(ifd->root_);
|
|
#endif
|
|
|
|
splx(s);
|
|
|
|
/*
|
|
* Free the class structure.
|
|
*/
|
|
if (cl->red_ != NULL) {
|
|
#ifdef ALTQ_RIO
|
|
if (q_is_rio(cl->q_))
|
|
rio_destroy((rio_t *)cl->red_);
|
|
#endif
|
|
#ifdef ALTQ_RED
|
|
if (q_is_red(cl->q_))
|
|
red_destroy(cl->red_);
|
|
#endif
|
|
}
|
|
free(cl->q_, M_DEVBUF);
|
|
free(cl, M_DEVBUF);
|
|
}
|
|
|
|
|
|
/*
|
|
* int
|
|
* rmc_init(...) - Initialize the resource management data structures
|
|
* associated with the output portion of interface 'ifp'. 'ifd' is
|
|
* where the structures will be built (for backwards compatibility, the
|
|
* structures aren't kept in the ifnet struct). 'nsecPerByte'
|
|
* gives the link speed (inverse of bandwidth) in nanoseconds/byte.
|
|
* 'restart' is the driver-specific routine that the generic 'delay
|
|
* until under limit' action will call to restart output. `maxq'
|
|
* is the queue size of the 'link' & 'default' classes. 'maxqueued'
|
|
* is the maximum number of packets that the resource management
|
|
* code will allow to be queued 'downstream' (this is typically 1).
|
|
*
|
|
* Returns: 0 on success
|
|
*/
|
|
|
|
int
|
|
rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte,
|
|
void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle,
|
|
int minidle, u_int offtime, int flags)
|
|
{
|
|
int i, mtu;
|
|
|
|
/*
|
|
* Initialize the CBQ tracing/debug facility.
|
|
*/
|
|
CBQTRACEINIT();
|
|
|
|
mtu = ifq->altq_ifp->if_mtu;
|
|
if (mtu < 1) {
|
|
printf("altq: %s: invalid MTU (interface not initialized?)\n",
|
|
ifq->altq_ifp->if_xname);
|
|
return (EINVAL);
|
|
}
|
|
|
|
(void)memset((char *)ifd, 0, sizeof (*ifd));
|
|
ifd->ifq_ = ifq;
|
|
ifd->restart = restart;
|
|
ifd->maxqueued_ = maxqueued;
|
|
ifd->ns_per_byte_ = nsecPerByte;
|
|
ifd->maxpkt_ = mtu;
|
|
ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0;
|
|
ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0;
|
|
#if 1
|
|
ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16;
|
|
if (mtu * nsecPerByte > 10 * 1000000)
|
|
ifd->maxiftime_ /= 4;
|
|
#endif
|
|
|
|
reset_cutoff(ifd);
|
|
CBQTRACE(rmc_init, 'INIT', ifd->cutoff_);
|
|
|
|
/*
|
|
* Initialize the CBQ's WRR state.
|
|
*/
|
|
for (i = 0; i < RM_MAXPRIO; i++) {
|
|
ifd->alloc_[i] = 0;
|
|
ifd->M_[i] = 0;
|
|
ifd->num_[i] = 0;
|
|
ifd->na_[i] = 0;
|
|
ifd->active_[i] = NULL;
|
|
}
|
|
|
|
/*
|
|
* Initialize current packet state.
|
|
*/
|
|
ifd->qi_ = 0;
|
|
ifd->qo_ = 0;
|
|
for (i = 0; i < RM_MAXQUEUED; i++) {
|
|
ifd->class_[i] = NULL;
|
|
ifd->curlen_[i] = 0;
|
|
ifd->borrowed_[i] = NULL;
|
|
}
|
|
|
|
/*
|
|
* Create the root class of the link-sharing structure.
|
|
*/
|
|
if ((ifd->root_ = rmc_newclass(0, ifd,
|
|
nsecPerByte,
|
|
rmc_root_overlimit, maxq, 0, 0,
|
|
maxidle, minidle, offtime,
|
|
0, 0)) == NULL) {
|
|
printf("rmc_init: root class not allocated\n");
|
|
return (ENOMEM);
|
|
}
|
|
ifd->root_->depth_ = 0;
|
|
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_queue_packet(struct rm_class *cl, mbuf_t *m) - Add packet given by
|
|
* mbuf 'm' to queue for resource class 'cl'. This routine is called
|
|
* by a driver's if_output routine. This routine must be called with
|
|
* output packet completion interrupts locked out (to avoid racing with
|
|
* rmc_dequeue_next).
|
|
*
|
|
* Returns: 0 on successful queueing
|
|
* -1 when packet drop occurs
|
|
*/
|
|
int
|
|
rmc_queue_packet(struct rm_class *cl, mbuf_t *m)
|
|
{
|
|
struct timeval now;
|
|
struct rm_ifdat *ifd = cl->ifdat_;
|
|
int cpri = cl->pri_;
|
|
int is_empty = qempty(cl->q_);
|
|
|
|
RM_GETTIME(now);
|
|
if (ifd->cutoff_ > 0) {
|
|
if (TV_LT(&cl->undertime_, &now)) {
|
|
if (ifd->cutoff_ > cl->depth_)
|
|
ifd->cutoff_ = cl->depth_;
|
|
CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_);
|
|
}
|
|
#if 1 /* ALTQ */
|
|
else {
|
|
/*
|
|
* the class is overlimit. if the class has
|
|
* underlimit ancestors, set cutoff to the lowest
|
|
* depth among them.
|
|
*/
|
|
struct rm_class *borrow = cl->borrow_;
|
|
|
|
while (borrow != NULL &&
|
|
borrow->depth_ < ifd->cutoff_) {
|
|
if (TV_LT(&borrow->undertime_, &now)) {
|
|
ifd->cutoff_ = borrow->depth_;
|
|
CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_);
|
|
break;
|
|
}
|
|
borrow = borrow->borrow_;
|
|
}
|
|
}
|
|
#else /* !ALTQ */
|
|
else if ((ifd->cutoff_ > 1) && cl->borrow_) {
|
|
if (TV_LT(&cl->borrow_->undertime_, &now)) {
|
|
ifd->cutoff_ = cl->borrow_->depth_;
|
|
CBQTRACE(rmc_queue_packet, 'ffob',
|
|
cl->borrow_->depth_);
|
|
}
|
|
}
|
|
#endif /* !ALTQ */
|
|
}
|
|
|
|
if (_rmc_addq(cl, m) < 0)
|
|
/* failed */
|
|
return (-1);
|
|
|
|
if (is_empty) {
|
|
CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle);
|
|
ifd->na_[cpri]++;
|
|
}
|
|
|
|
if (qlen(cl->q_) > qlimit(cl->q_)) {
|
|
/* note: qlimit can be set to 0 or 1 */
|
|
rmc_drop_action(cl);
|
|
return (-1);
|
|
}
|
|
return (0);
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all
|
|
* classes to see if there are satified.
|
|
*/
|
|
|
|
static void
|
|
rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now)
|
|
{
|
|
int i;
|
|
rm_class_t *p, *bp;
|
|
|
|
for (i = RM_MAXPRIO - 1; i >= 0; i--) {
|
|
if ((bp = ifd->active_[i]) != NULL) {
|
|
p = bp;
|
|
do {
|
|
if (!rmc_satisfied(p, now)) {
|
|
ifd->cutoff_ = p->depth_;
|
|
return;
|
|
}
|
|
p = p->peer_;
|
|
} while (p != bp);
|
|
}
|
|
}
|
|
|
|
reset_cutoff(ifd);
|
|
}
|
|
|
|
/*
|
|
* rmc_satisfied - Return 1 of the class is satisfied. O, otherwise.
|
|
*/
|
|
|
|
static int
|
|
rmc_satisfied(struct rm_class *cl, struct timeval *now)
|
|
{
|
|
rm_class_t *p;
|
|
|
|
if (cl == NULL)
|
|
return (1);
|
|
if (TV_LT(now, &cl->undertime_))
|
|
return (1);
|
|
if (cl->depth_ == 0) {
|
|
if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_))
|
|
return (0);
|
|
else
|
|
return (1);
|
|
}
|
|
if (cl->children_ != NULL) {
|
|
p = cl->children_;
|
|
while (p != NULL) {
|
|
if (!rmc_satisfied(p, now))
|
|
return (0);
|
|
p = p->next_;
|
|
}
|
|
}
|
|
|
|
return (1);
|
|
}
|
|
|
|
/*
|
|
* Return 1 if class 'cl' is under limit or can borrow from a parent,
|
|
* 0 if overlimit. As a side-effect, this routine will invoke the
|
|
* class overlimit action if the class if overlimit.
|
|
*/
|
|
|
|
static int
|
|
rmc_under_limit(struct rm_class *cl, struct timeval *now)
|
|
{
|
|
rm_class_t *p = cl;
|
|
rm_class_t *top;
|
|
struct rm_ifdat *ifd = cl->ifdat_;
|
|
|
|
ifd->borrowed_[ifd->qi_] = NULL;
|
|
/*
|
|
* If cl is the root class, then always return that it is
|
|
* underlimit. Otherwise, check to see if the class is underlimit.
|
|
*/
|
|
if (cl->parent_ == NULL)
|
|
return (1);
|
|
|
|
if (cl->sleeping_) {
|
|
if (TV_LT(now, &cl->undertime_))
|
|
return (0);
|
|
|
|
CALLOUT_STOP(&cl->callout_);
|
|
cl->sleeping_ = 0;
|
|
cl->undertime_.tv_sec = 0;
|
|
return (1);
|
|
}
|
|
|
|
top = NULL;
|
|
while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) {
|
|
if (((cl = cl->borrow_) == NULL) ||
|
|
(cl->depth_ > ifd->cutoff_)) {
|
|
#ifdef ADJUST_CUTOFF
|
|
if (cl != NULL)
|
|
/* cutoff is taking effect, just
|
|
return false without calling
|
|
the delay action. */
|
|
return (0);
|
|
#endif
|
|
#ifdef BORROW_OFFTIME
|
|
/*
|
|
* check if the class can borrow offtime too.
|
|
* borrow offtime from the top of the borrow
|
|
* chain if the top class is not overloaded.
|
|
*/
|
|
if (cl != NULL) {
|
|
/* cutoff is taking effect, use this class as top. */
|
|
top = cl;
|
|
CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_);
|
|
}
|
|
if (top != NULL && top->avgidle_ == top->minidle_)
|
|
top = NULL;
|
|
p->overtime_ = *now;
|
|
(p->overlimit)(p, top);
|
|
#else
|
|
p->overtime_ = *now;
|
|
(p->overlimit)(p, NULL);
|
|
#endif
|
|
return (0);
|
|
}
|
|
top = cl;
|
|
}
|
|
|
|
if (cl != p)
|
|
ifd->borrowed_[ifd->qi_] = cl;
|
|
return (1);
|
|
}
|
|
|
|
/*
|
|
* _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to
|
|
* Packet-by-packet round robin.
|
|
*
|
|
* The heart of the weighted round-robin scheduler, which decides which
|
|
* class next gets to send a packet. Highest priority first, then
|
|
* weighted round-robin within priorites.
|
|
*
|
|
* Each able-to-send class gets to send until its byte allocation is
|
|
* exhausted. Thus, the active pointer is only changed after a class has
|
|
* exhausted its allocation.
|
|
*
|
|
* If the scheduler finds no class that is underlimit or able to borrow,
|
|
* then the first class found that had a nonzero queue and is allowed to
|
|
* borrow gets to send.
|
|
*/
|
|
|
|
static mbuf_t *
|
|
_rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op)
|
|
{
|
|
struct rm_class *cl = NULL, *first = NULL;
|
|
u_int deficit;
|
|
int cpri;
|
|
mbuf_t *m;
|
|
struct timeval now;
|
|
|
|
RM_GETTIME(now);
|
|
|
|
/*
|
|
* if the driver polls the top of the queue and then removes
|
|
* the polled packet, we must return the same packet.
|
|
*/
|
|
if (op == ALTDQ_REMOVE && ifd->pollcache_) {
|
|
cl = ifd->pollcache_;
|
|
cpri = cl->pri_;
|
|
if (ifd->efficient_) {
|
|
/* check if this class is overlimit */
|
|
if (cl->undertime_.tv_sec != 0 &&
|
|
rmc_under_limit(cl, &now) == 0)
|
|
first = cl;
|
|
}
|
|
ifd->pollcache_ = NULL;
|
|
goto _wrr_out;
|
|
}
|
|
else {
|
|
/* mode == ALTDQ_POLL || pollcache == NULL */
|
|
ifd->pollcache_ = NULL;
|
|
ifd->borrowed_[ifd->qi_] = NULL;
|
|
}
|
|
#ifdef ADJUST_CUTOFF
|
|
_again:
|
|
#endif
|
|
for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
|
|
if (ifd->na_[cpri] == 0)
|
|
continue;
|
|
deficit = 0;
|
|
/*
|
|
* Loop through twice for a priority level, if some class
|
|
* was unable to send a packet the first round because
|
|
* of the weighted round-robin mechanism.
|
|
* During the second loop at this level, deficit==2.
|
|
* (This second loop is not needed if for every class,
|
|
* "M[cl->pri_])" times "cl->allotment" is greater than
|
|
* the byte size for the largest packet in the class.)
|
|
*/
|
|
_wrr_loop:
|
|
cl = ifd->active_[cpri];
|
|
ASSERT(cl != NULL);
|
|
do {
|
|
if ((deficit < 2) && (cl->bytes_alloc_ <= 0))
|
|
cl->bytes_alloc_ += cl->w_allotment_;
|
|
if (!qempty(cl->q_)) {
|
|
if ((cl->undertime_.tv_sec == 0) ||
|
|
rmc_under_limit(cl, &now)) {
|
|
if (cl->bytes_alloc_ > 0 || deficit > 1)
|
|
goto _wrr_out;
|
|
|
|
/* underlimit but no alloc */
|
|
deficit = 1;
|
|
#if 1
|
|
ifd->borrowed_[ifd->qi_] = NULL;
|
|
#endif
|
|
}
|
|
else if (first == NULL && cl->borrow_ != NULL)
|
|
first = cl; /* borrowing candidate */
|
|
}
|
|
|
|
cl->bytes_alloc_ = 0;
|
|
cl = cl->peer_;
|
|
} while (cl != ifd->active_[cpri]);
|
|
|
|
if (deficit == 1) {
|
|
/* first loop found an underlimit class with deficit */
|
|
/* Loop on same priority level, with new deficit. */
|
|
deficit = 2;
|
|
goto _wrr_loop;
|
|
}
|
|
}
|
|
|
|
#ifdef ADJUST_CUTOFF
|
|
/*
|
|
* no underlimit class found. if cutoff is taking effect,
|
|
* increase cutoff and try again.
|
|
*/
|
|
if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
|
|
ifd->cutoff_++;
|
|
CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_);
|
|
goto _again;
|
|
}
|
|
#endif /* ADJUST_CUTOFF */
|
|
/*
|
|
* If LINK_EFFICIENCY is turned on, then the first overlimit
|
|
* class we encounter will send a packet if all the classes
|
|
* of the link-sharing structure are overlimit.
|
|
*/
|
|
reset_cutoff(ifd);
|
|
CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_);
|
|
|
|
if (!ifd->efficient_ || first == NULL)
|
|
return (NULL);
|
|
|
|
cl = first;
|
|
cpri = cl->pri_;
|
|
#if 0 /* too time-consuming for nothing */
|
|
if (cl->sleeping_)
|
|
CALLOUT_STOP(&cl->callout_);
|
|
cl->sleeping_ = 0;
|
|
cl->undertime_.tv_sec = 0;
|
|
#endif
|
|
ifd->borrowed_[ifd->qi_] = cl->borrow_;
|
|
ifd->cutoff_ = cl->borrow_->depth_;
|
|
|
|
/*
|
|
* Deque the packet and do the book keeping...
|
|
*/
|
|
_wrr_out:
|
|
if (op == ALTDQ_REMOVE) {
|
|
m = _rmc_getq(cl);
|
|
if (m == NULL)
|
|
panic("_rmc_wrr_dequeue_next");
|
|
if (qempty(cl->q_))
|
|
ifd->na_[cpri]--;
|
|
|
|
/*
|
|
* Update class statistics and link data.
|
|
*/
|
|
if (cl->bytes_alloc_ > 0)
|
|
cl->bytes_alloc_ -= m_pktlen(m);
|
|
|
|
if ((cl->bytes_alloc_ <= 0) || first == cl)
|
|
ifd->active_[cl->pri_] = cl->peer_;
|
|
else
|
|
ifd->active_[cl->pri_] = cl;
|
|
|
|
ifd->class_[ifd->qi_] = cl;
|
|
ifd->curlen_[ifd->qi_] = m_pktlen(m);
|
|
ifd->now_[ifd->qi_] = now;
|
|
ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
|
|
ifd->queued_++;
|
|
} else {
|
|
/* mode == ALTDQ_PPOLL */
|
|
m = _rmc_pollq(cl);
|
|
ifd->pollcache_ = cl;
|
|
}
|
|
return (m);
|
|
}
|
|
|
|
/*
|
|
* Dequeue & return next packet from the highest priority class that
|
|
* has a packet to send & has enough allocation to send it. This
|
|
* routine is called by a driver whenever it needs a new packet to
|
|
* output.
|
|
*/
|
|
static mbuf_t *
|
|
_rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op)
|
|
{
|
|
mbuf_t *m;
|
|
int cpri;
|
|
struct rm_class *cl, *first = NULL;
|
|
struct timeval now;
|
|
|
|
RM_GETTIME(now);
|
|
|
|
/*
|
|
* if the driver polls the top of the queue and then removes
|
|
* the polled packet, we must return the same packet.
|
|
*/
|
|
if (op == ALTDQ_REMOVE && ifd->pollcache_) {
|
|
cl = ifd->pollcache_;
|
|
cpri = cl->pri_;
|
|
ifd->pollcache_ = NULL;
|
|
goto _prr_out;
|
|
} else {
|
|
/* mode == ALTDQ_POLL || pollcache == NULL */
|
|
ifd->pollcache_ = NULL;
|
|
ifd->borrowed_[ifd->qi_] = NULL;
|
|
}
|
|
#ifdef ADJUST_CUTOFF
|
|
_again:
|
|
#endif
|
|
for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) {
|
|
if (ifd->na_[cpri] == 0)
|
|
continue;
|
|
cl = ifd->active_[cpri];
|
|
ASSERT(cl != NULL);
|
|
do {
|
|
if (!qempty(cl->q_)) {
|
|
if ((cl->undertime_.tv_sec == 0) ||
|
|
rmc_under_limit(cl, &now))
|
|
goto _prr_out;
|
|
if (first == NULL && cl->borrow_ != NULL)
|
|
first = cl;
|
|
}
|
|
cl = cl->peer_;
|
|
} while (cl != ifd->active_[cpri]);
|
|
}
|
|
|
|
#ifdef ADJUST_CUTOFF
|
|
/*
|
|
* no underlimit class found. if cutoff is taking effect, increase
|
|
* cutoff and try again.
|
|
*/
|
|
if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) {
|
|
ifd->cutoff_++;
|
|
goto _again;
|
|
}
|
|
#endif /* ADJUST_CUTOFF */
|
|
/*
|
|
* If LINK_EFFICIENCY is turned on, then the first overlimit
|
|
* class we encounter will send a packet if all the classes
|
|
* of the link-sharing structure are overlimit.
|
|
*/
|
|
reset_cutoff(ifd);
|
|
if (!ifd->efficient_ || first == NULL)
|
|
return (NULL);
|
|
|
|
cl = first;
|
|
cpri = cl->pri_;
|
|
#if 0 /* too time-consuming for nothing */
|
|
if (cl->sleeping_)
|
|
CALLOUT_STOP(&cl->callout_);
|
|
cl->sleeping_ = 0;
|
|
cl->undertime_.tv_sec = 0;
|
|
#endif
|
|
ifd->borrowed_[ifd->qi_] = cl->borrow_;
|
|
ifd->cutoff_ = cl->borrow_->depth_;
|
|
|
|
/*
|
|
* Deque the packet and do the book keeping...
|
|
*/
|
|
_prr_out:
|
|
if (op == ALTDQ_REMOVE) {
|
|
m = _rmc_getq(cl);
|
|
if (m == NULL)
|
|
panic("_rmc_prr_dequeue_next");
|
|
if (qempty(cl->q_))
|
|
ifd->na_[cpri]--;
|
|
|
|
ifd->active_[cpri] = cl->peer_;
|
|
|
|
ifd->class_[ifd->qi_] = cl;
|
|
ifd->curlen_[ifd->qi_] = m_pktlen(m);
|
|
ifd->now_[ifd->qi_] = now;
|
|
ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_;
|
|
ifd->queued_++;
|
|
} else {
|
|
/* mode == ALTDQ_POLL */
|
|
m = _rmc_pollq(cl);
|
|
ifd->pollcache_ = cl;
|
|
}
|
|
return (m);
|
|
}
|
|
|
|
/*
|
|
* mbuf_t *
|
|
* rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function
|
|
* is invoked by the packet driver to get the next packet to be
|
|
* dequeued and output on the link. If WRR is enabled, then the
|
|
* WRR dequeue next routine will determine the next packet to sent.
|
|
* Otherwise, packet-by-packet round robin is invoked.
|
|
*
|
|
* Returns: NULL, if a packet is not available or if all
|
|
* classes are overlimit.
|
|
*
|
|
* Otherwise, Pointer to the next packet.
|
|
*/
|
|
|
|
mbuf_t *
|
|
rmc_dequeue_next(struct rm_ifdat *ifd, int mode)
|
|
{
|
|
if (ifd->queued_ >= ifd->maxqueued_)
|
|
return (NULL);
|
|
else if (ifd->wrr_)
|
|
return (_rmc_wrr_dequeue_next(ifd, mode));
|
|
else
|
|
return (_rmc_prr_dequeue_next(ifd, mode));
|
|
}
|
|
|
|
/*
|
|
* Update the utilization estimate for the packet that just completed.
|
|
* The packet's class & the parent(s) of that class all get their
|
|
* estimators updated. This routine is called by the driver's output-
|
|
* packet-completion interrupt service routine.
|
|
*/
|
|
|
|
/*
|
|
* a macro to approximate "divide by 1000" that gives 0.000999,
|
|
* if a value has enough effective digits.
|
|
* (on pentium, mul takes 9 cycles but div takes 46!)
|
|
*/
|
|
#define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17))
|
|
void
|
|
rmc_update_class_util(struct rm_ifdat *ifd)
|
|
{
|
|
int idle, avgidle, pktlen;
|
|
int pkt_time, tidle;
|
|
rm_class_t *cl, *borrowed;
|
|
rm_class_t *borrows;
|
|
struct timeval *nowp;
|
|
|
|
/*
|
|
* Get the most recent completed class.
|
|
*/
|
|
if ((cl = ifd->class_[ifd->qo_]) == NULL)
|
|
return;
|
|
|
|
pktlen = ifd->curlen_[ifd->qo_];
|
|
borrowed = ifd->borrowed_[ifd->qo_];
|
|
borrows = borrowed;
|
|
|
|
PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
|
|
|
|
/*
|
|
* Run estimator on class and its ancestors.
|
|
*/
|
|
/*
|
|
* rm_update_class_util is designed to be called when the
|
|
* transfer is completed from a xmit complete interrupt,
|
|
* but most drivers don't implement an upcall for that.
|
|
* so, just use estimated completion time.
|
|
* as a result, ifd->qi_ and ifd->qo_ are always synced.
|
|
*/
|
|
nowp = &ifd->now_[ifd->qo_];
|
|
/* get pkt_time (for link) in usec */
|
|
#if 1 /* use approximation */
|
|
pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_;
|
|
pkt_time = NSEC_TO_USEC(pkt_time);
|
|
#else
|
|
pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000;
|
|
#endif
|
|
#if 1 /* ALTQ4PPP */
|
|
if (TV_LT(nowp, &ifd->ifnow_)) {
|
|
int iftime;
|
|
|
|
/*
|
|
* make sure the estimated completion time does not go
|
|
* too far. it can happen when the link layer supports
|
|
* data compression or the interface speed is set to
|
|
* a much lower value.
|
|
*/
|
|
TV_DELTA(&ifd->ifnow_, nowp, iftime);
|
|
if (iftime+pkt_time < ifd->maxiftime_) {
|
|
TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
|
|
} else {
|
|
TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_);
|
|
}
|
|
} else {
|
|
TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
|
|
}
|
|
#else
|
|
if (TV_LT(nowp, &ifd->ifnow_)) {
|
|
TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_);
|
|
} else {
|
|
TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_);
|
|
}
|
|
#endif
|
|
|
|
while (cl != NULL) {
|
|
TV_DELTA(&ifd->ifnow_, &cl->last_, idle);
|
|
if (idle >= 2000000)
|
|
/*
|
|
* this class is idle enough, reset avgidle.
|
|
* (TV_DELTA returns 2000000 us when delta is large.)
|
|
*/
|
|
cl->avgidle_ = cl->maxidle_;
|
|
|
|
/* get pkt_time (for class) in usec */
|
|
#if 1 /* use approximation */
|
|
pkt_time = pktlen * cl->ns_per_byte_;
|
|
pkt_time = NSEC_TO_USEC(pkt_time);
|
|
#else
|
|
pkt_time = pktlen * cl->ns_per_byte_ / 1000;
|
|
#endif
|
|
idle -= pkt_time;
|
|
|
|
avgidle = cl->avgidle_;
|
|
avgidle += idle - (avgidle >> RM_FILTER_GAIN);
|
|
cl->avgidle_ = avgidle;
|
|
|
|
/* Are we overlimit ? */
|
|
if (avgidle <= 0) {
|
|
CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle);
|
|
#if 1 /* ALTQ */
|
|
/*
|
|
* need some lower bound for avgidle, otherwise
|
|
* a borrowing class gets unbounded penalty.
|
|
*/
|
|
if (avgidle < cl->minidle_)
|
|
avgidle = cl->avgidle_ = cl->minidle_;
|
|
#endif
|
|
/* set next idle to make avgidle 0 */
|
|
tidle = pkt_time +
|
|
(((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN);
|
|
TV_ADD_DELTA(nowp, tidle, &cl->undertime_);
|
|
++cl->stats_.over;
|
|
} else {
|
|
cl->avgidle_ =
|
|
(avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle;
|
|
cl->undertime_.tv_sec = 0;
|
|
if (cl->sleeping_) {
|
|
CALLOUT_STOP(&cl->callout_);
|
|
cl->sleeping_ = 0;
|
|
}
|
|
}
|
|
|
|
if (borrows != NULL) {
|
|
if (borrows != cl)
|
|
++cl->stats_.borrows;
|
|
else
|
|
borrows = NULL;
|
|
}
|
|
cl->last_ = ifd->ifnow_;
|
|
cl->last_pkttime_ = pkt_time;
|
|
|
|
#if 1
|
|
if (cl->parent_ == NULL) {
|
|
/* take stats of root class */
|
|
PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen);
|
|
}
|
|
#endif
|
|
|
|
cl = cl->parent_;
|
|
}
|
|
|
|
/*
|
|
* Check to see if cutoff needs to set to a new level.
|
|
*/
|
|
cl = ifd->class_[ifd->qo_];
|
|
if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) {
|
|
#if 1 /* ALTQ */
|
|
if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) {
|
|
rmc_tl_satisfied(ifd, nowp);
|
|
CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
|
|
} else {
|
|
ifd->cutoff_ = borrowed->depth_;
|
|
CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
|
|
}
|
|
#else /* !ALTQ */
|
|
if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) {
|
|
reset_cutoff(ifd);
|
|
#ifdef notdef
|
|
rmc_tl_satisfied(ifd, &now);
|
|
#endif
|
|
CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_);
|
|
} else {
|
|
ifd->cutoff_ = borrowed->depth_;
|
|
CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_);
|
|
}
|
|
#endif /* !ALTQ */
|
|
}
|
|
|
|
/*
|
|
* Release class slot
|
|
*/
|
|
ifd->borrowed_[ifd->qo_] = NULL;
|
|
ifd->class_[ifd->qo_] = NULL;
|
|
ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_;
|
|
ifd->queued_--;
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific)
|
|
* over-limit action routines. These get invoked by rmc_under_limit()
|
|
* if a class with packets to send if over its bandwidth limit & can't
|
|
* borrow from a parent class.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_drop_action(struct rm_class *cl)
|
|
{
|
|
struct rm_ifdat *ifd = cl->ifdat_;
|
|
|
|
ASSERT(qlen(cl->q_) > 0);
|
|
_rmc_dropq(cl);
|
|
if (qempty(cl->q_))
|
|
ifd->na_[cl->pri_]--;
|
|
}
|
|
|
|
void
|
|
rmc_dropall(struct rm_class *cl)
|
|
{
|
|
struct rm_ifdat *ifd = cl->ifdat_;
|
|
|
|
if (!qempty(cl->q_)) {
|
|
_flushq(cl->q_);
|
|
|
|
ifd->na_[cl->pri_]--;
|
|
}
|
|
}
|
|
|
|
#if (__FreeBSD_version > 300000)
|
|
static int tvhzto(struct timeval *);
|
|
|
|
static int
|
|
tvhzto(struct timeval *tv)
|
|
{
|
|
struct timeval t2;
|
|
|
|
getmicrotime(&t2);
|
|
t2.tv_sec = tv->tv_sec - t2.tv_sec;
|
|
t2.tv_usec = tv->tv_usec - t2.tv_usec;
|
|
return (tvtohz(&t2));
|
|
}
|
|
#endif /* __FreeBSD_version > 300000 */
|
|
|
|
/*
|
|
* void
|
|
* rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ
|
|
* delay action routine. It is invoked via rmc_under_limit when the
|
|
* packet is discoverd to be overlimit.
|
|
*
|
|
* If the delay action is result of borrow class being overlimit, then
|
|
* delay for the offtime of the borrowing class that is overlimit.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
void
|
|
rmc_delay_action(struct rm_class *cl, struct rm_class *borrow)
|
|
{
|
|
int ndelay, t, extradelay;
|
|
|
|
cl->stats_.overactions++;
|
|
TV_DELTA(&cl->undertime_, &cl->overtime_, ndelay);
|
|
#ifndef BORROW_OFFTIME
|
|
ndelay += cl->offtime_;
|
|
#endif
|
|
|
|
if (!cl->sleeping_) {
|
|
CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle);
|
|
#ifdef BORROW_OFFTIME
|
|
if (borrow != NULL)
|
|
extradelay = borrow->offtime_;
|
|
else
|
|
#endif
|
|
extradelay = cl->offtime_;
|
|
|
|
#ifdef ALTQ
|
|
/*
|
|
* XXX recalculate suspend time:
|
|
* current undertime is (tidle + pkt_time) calculated
|
|
* from the last transmission.
|
|
* tidle: time required to bring avgidle back to 0
|
|
* pkt_time: target waiting time for this class
|
|
* we need to replace pkt_time by offtime
|
|
*/
|
|
extradelay -= cl->last_pkttime_;
|
|
#endif
|
|
if (extradelay > 0) {
|
|
TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_);
|
|
ndelay += extradelay;
|
|
}
|
|
|
|
cl->sleeping_ = 1;
|
|
cl->stats_.delays++;
|
|
|
|
/*
|
|
* Since packets are phased randomly with respect to the
|
|
* clock, 1 tick (the next clock tick) can be an arbitrarily
|
|
* short time so we have to wait for at least two ticks.
|
|
* NOTE: If there's no other traffic, we need the timer as
|
|
* a 'backstop' to restart this class.
|
|
*/
|
|
if (ndelay > tick * 2) {
|
|
#ifdef __FreeBSD__
|
|
/* FreeBSD rounds up the tick */
|
|
t = tvhzto(&cl->undertime_);
|
|
#else
|
|
/* other BSDs round down the tick */
|
|
t = tvhzto(&cl->undertime_) + 1;
|
|
#endif
|
|
} else
|
|
t = 2;
|
|
CALLOUT_RESET(&cl->callout_, t,
|
|
(timeout_t *)rmc_restart, (void *)cl);
|
|
}
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_restart() - is just a helper routine for rmc_delay_action -- it is
|
|
* called by the system timer code & is responsible checking if the
|
|
* class is still sleeping (it might have been restarted as a side
|
|
* effect of the queue scan on a packet arrival) and, if so, restarting
|
|
* output for the class. Inspecting the class state & restarting output
|
|
* require locking the class structure. In general the driver is
|
|
* responsible for locking but this is the only routine that is not
|
|
* called directly or indirectly from the interface driver so it has
|
|
* know about system locking conventions. Under bsd, locking is done
|
|
* by raising IPL to splnet so that's what's implemented here. On a
|
|
* different system this would probably need to be changed.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_restart(struct rm_class *cl)
|
|
{
|
|
struct rm_ifdat *ifd = cl->ifdat_;
|
|
int s;
|
|
|
|
s = splnet();
|
|
if (cl->sleeping_) {
|
|
cl->sleeping_ = 0;
|
|
cl->undertime_.tv_sec = 0;
|
|
|
|
if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) {
|
|
CBQTRACE(rmc_restart, 'trts', cl->stats_.handle);
|
|
(ifd->restart)(ifd->ifq_);
|
|
}
|
|
}
|
|
splx(s);
|
|
}
|
|
|
|
/*
|
|
* void
|
|
* rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit
|
|
* handling routine for the root class of the link sharing structure.
|
|
*
|
|
* Returns: NONE
|
|
*/
|
|
|
|
static void
|
|
rmc_root_overlimit(struct rm_class *cl,
|
|
struct rm_class *borrow)
|
|
{
|
|
panic("rmc_root_overlimit");
|
|
}
|
|
|
|
/*
|
|
* Packet Queue handling routines. Eventually, this is to localize the
|
|
* effects on the code whether queues are red queues or droptail
|
|
* queues.
|
|
*/
|
|
|
|
static int
|
|
_rmc_addq(rm_class_t *cl, mbuf_t *m)
|
|
{
|
|
#ifdef ALTQ_RIO
|
|
if (q_is_rio(cl->q_))
|
|
return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_);
|
|
#endif
|
|
#ifdef ALTQ_RED
|
|
if (q_is_red(cl->q_))
|
|
return red_addq(cl->red_, cl->q_, m, cl->pktattr_);
|
|
#endif /* ALTQ_RED */
|
|
|
|
if (cl->flags_ & RMCF_CLEARDSCP)
|
|
write_dsfield(m, cl->pktattr_, 0);
|
|
|
|
_addq(cl->q_, m);
|
|
return (0);
|
|
}
|
|
|
|
/* note: _rmc_dropq is not called for red */
|
|
static void
|
|
_rmc_dropq(rm_class_t *cl)
|
|
{
|
|
mbuf_t *m;
|
|
|
|
if ((m = _getq(cl->q_)) != NULL)
|
|
m_freem(m);
|
|
}
|
|
|
|
static mbuf_t *
|
|
_rmc_getq(rm_class_t *cl)
|
|
{
|
|
#ifdef ALTQ_RIO
|
|
if (q_is_rio(cl->q_))
|
|
return rio_getq((rio_t *)cl->red_, cl->q_);
|
|
#endif
|
|
#ifdef ALTQ_RED
|
|
if (q_is_red(cl->q_))
|
|
return red_getq(cl->red_, cl->q_);
|
|
#endif
|
|
return _getq(cl->q_);
|
|
}
|
|
|
|
static mbuf_t *
|
|
_rmc_pollq(rm_class_t *cl)
|
|
{
|
|
return qhead(cl->q_);
|
|
}
|
|
|
|
#ifdef CBQ_TRACE
|
|
|
|
struct cbqtrace cbqtrace_buffer[NCBQTRACE+1];
|
|
struct cbqtrace *cbqtrace_ptr = NULL;
|
|
int cbqtrace_count;
|
|
|
|
/*
|
|
* DDB hook to trace cbq events:
|
|
* the last 1024 events are held in a circular buffer.
|
|
* use "call cbqtrace_dump(N)" to display 20 events from Nth event.
|
|
*/
|
|
void cbqtrace_dump(int);
|
|
static char *rmc_funcname(void *);
|
|
|
|
static struct rmc_funcs {
|
|
void *func;
|
|
char *name;
|
|
} rmc_funcs[] =
|
|
{
|
|
rmc_init, "rmc_init",
|
|
rmc_queue_packet, "rmc_queue_packet",
|
|
rmc_under_limit, "rmc_under_limit",
|
|
rmc_update_class_util, "rmc_update_class_util",
|
|
rmc_delay_action, "rmc_delay_action",
|
|
rmc_restart, "rmc_restart",
|
|
_rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next",
|
|
NULL, NULL
|
|
};
|
|
|
|
static char *
|
|
rmc_funcname(void *func)
|
|
{
|
|
struct rmc_funcs *fp;
|
|
|
|
for (fp = rmc_funcs; fp->func != NULL; fp++)
|
|
if (fp->func == func)
|
|
return (fp->name);
|
|
return ("unknown");
|
|
}
|
|
|
|
void
|
|
cbqtrace_dump(int counter)
|
|
{
|
|
int i, *p;
|
|
char *cp;
|
|
|
|
counter = counter % NCBQTRACE;
|
|
p = (int *)&cbqtrace_buffer[counter];
|
|
|
|
for (i=0; i<20; i++) {
|
|
printf("[0x%x] ", *p++);
|
|
printf("%s: ", rmc_funcname((void *)*p++));
|
|
cp = (char *)p++;
|
|
printf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]);
|
|
printf("%d\n",*p++);
|
|
|
|
if (p >= (int *)&cbqtrace_buffer[NCBQTRACE])
|
|
p = (int *)cbqtrace_buffer;
|
|
}
|
|
}
|
|
#endif /* CBQ_TRACE */
|
|
#endif /* ALTQ_CBQ */
|
|
|
|
#if defined(ALTQ_CBQ) || defined(ALTQ_RED) || defined(ALTQ_RIO) || defined(ALTQ_HFSC) || defined(ALTQ_PRIQ)
|
|
#if !defined(__GNUC__) || defined(ALTQ_DEBUG)
|
|
|
|
void
|
|
_addq(class_queue_t *q, mbuf_t *m)
|
|
{
|
|
mbuf_t *m0;
|
|
|
|
if ((m0 = qtail(q)) != NULL)
|
|
m->m_nextpkt = m0->m_nextpkt;
|
|
else
|
|
m0 = m;
|
|
m0->m_nextpkt = m;
|
|
qtail(q) = m;
|
|
qlen(q)++;
|
|
}
|
|
|
|
mbuf_t *
|
|
_getq(class_queue_t *q)
|
|
{
|
|
mbuf_t *m, *m0;
|
|
|
|
if ((m = qtail(q)) == NULL)
|
|
return (NULL);
|
|
if ((m0 = m->m_nextpkt) != m)
|
|
m->m_nextpkt = m0->m_nextpkt;
|
|
else {
|
|
ASSERT(qlen(q) == 1);
|
|
qtail(q) = NULL;
|
|
}
|
|
qlen(q)--;
|
|
m0->m_nextpkt = NULL;
|
|
return (m0);
|
|
}
|
|
|
|
/* drop a packet at the tail of the queue */
|
|
mbuf_t *
|
|
_getq_tail(class_queue_t *q)
|
|
{
|
|
mbuf_t *m, *m0, *prev;
|
|
|
|
if ((m = m0 = qtail(q)) == NULL)
|
|
return NULL;
|
|
do {
|
|
prev = m0;
|
|
m0 = m0->m_nextpkt;
|
|
} while (m0 != m);
|
|
prev->m_nextpkt = m->m_nextpkt;
|
|
if (prev == m) {
|
|
ASSERT(qlen(q) == 1);
|
|
qtail(q) = NULL;
|
|
} else
|
|
qtail(q) = prev;
|
|
qlen(q)--;
|
|
m->m_nextpkt = NULL;
|
|
return (m);
|
|
}
|
|
|
|
/* randomly select a packet in the queue */
|
|
mbuf_t *
|
|
_getq_random(class_queue_t *q)
|
|
{
|
|
struct mbuf *m;
|
|
int i, n;
|
|
|
|
if ((m = qtail(q)) == NULL)
|
|
return NULL;
|
|
if (m->m_nextpkt == m) {
|
|
ASSERT(qlen(q) == 1);
|
|
qtail(q) = NULL;
|
|
} else {
|
|
struct mbuf *prev = NULL;
|
|
|
|
n = arc4random() % qlen(q) + 1;
|
|
for (i = 0; i < n; i++) {
|
|
prev = m;
|
|
m = m->m_nextpkt;
|
|
}
|
|
prev->m_nextpkt = m->m_nextpkt;
|
|
if (m == qtail(q))
|
|
qtail(q) = prev;
|
|
}
|
|
qlen(q)--;
|
|
m->m_nextpkt = NULL;
|
|
return (m);
|
|
}
|
|
|
|
void
|
|
_removeq(class_queue_t *q, mbuf_t *m)
|
|
{
|
|
mbuf_t *m0, *prev;
|
|
|
|
m0 = qtail(q);
|
|
do {
|
|
prev = m0;
|
|
m0 = m0->m_nextpkt;
|
|
} while (m0 != m);
|
|
prev->m_nextpkt = m->m_nextpkt;
|
|
if (prev == m)
|
|
qtail(q) = NULL;
|
|
else if (qtail(q) == m)
|
|
qtail(q) = prev;
|
|
qlen(q)--;
|
|
}
|
|
|
|
void
|
|
_flushq(class_queue_t *q)
|
|
{
|
|
mbuf_t *m;
|
|
|
|
while ((m = _getq(q)) != NULL)
|
|
m_freem(m);
|
|
ASSERT(qlen(q) == 0);
|
|
}
|
|
|
|
#endif /* !__GNUC__ || ALTQ_DEBUG */
|
|
#endif /* ALTQ_CBQ || ALTQ_RED || ALTQ_RIO || ALTQ_HFSC || ALTQ_PRIQ */
|